我正在尝试抓取 craigslist 上的公寓信息。
代码:
// Scrape the Craigslist apartment search results for one city and, for every
// result row, follow the listing link to read the address from the detail page.
$city = 'saltlakecity';
$rooms = '';
$query = '';
$sdate = '';

// http_build_query() URL-encodes every filter value, so a search term that
// contains spaces or '&' cannot corrupt the query string. The separator is
// passed explicitly so the arg_separator.output ini setting cannot change it.
$url = 'http://' . $city . '.craigslist.org/search/apa?' . http_build_query(
    array('bedrooms' => $rooms, 'query' => $query, 'sale_date' => $sdate),
    '',
    '&'
);
$base_url = parse_url($url, PHP_URL_HOST);

$resultspage = file_get_contents($url);
if ($resultspage === false || $resultspage === '') {
    // loadHTML() cannot parse a failed/empty download; stop with a readable message.
    exit('Unable to download the search results page.');
}

// Real-world HTML is rarely valid: collect libxml warnings internally instead
// of printing them. This is a global switch, so it is set once, not per row.
libxml_use_internal_errors(true);

$dom = new DOMDocument();
$dom->loadHTML($resultspage);
libxml_clear_errors();
$xpath = new DOMXpath($dom);

// Returns the text of the first node matched by $expression (relative to
// $context when one is given), or '' when nothing matches. Dereferencing
// item(0) directly breaks as soon as a single row lacks the expected markup.
$firstText = function (DOMXPath $xp, $expression, $context = null) {
    $nodes = $xp->query($expression, $context);
    if (!($nodes instanceof DOMNodeList) || $nodes->length === 0) {
        return '';
    }
    return $nodes->item(0)->nodeValue;
};

$data = array();
$rows = $xpath->query('//p[@class="row"]'); // one <p class="row"> per search result
foreach ($rows as $entries) {
    $entry = array();
    $entry['title'] = $firstText($xpath, './span[@class="txt"]/span[@class="pl"]/a', $entries);

    // The href is relative to the site root, so it is prefixed with the host.
    $anchors = $xpath->query('./a[@class="i"]', $entries);
    $href = $anchors->length > 0 ? $anchors->item(0)->getAttribute('href') : '';
    $entry['link'] = $href !== '' ? 'http://' . $base_url . $href : '';

    $entry['price'] = $firstText($xpath, './span[@class="txt"]/span[@class="l2"]/span[1]', $entries);

    // The location is rendered wrapped in parentheses; strip them.
    $location = $firstText($xpath, './span[@class="txt"]/span[@class="l2"]/span[2]', $entries);
    $entry['location'] = str_replace(array('(', ')'), '', $location);

    $entry['seller'] = $firstText($xpath, './span[@class="txt"]/span[@class="l2"]/a', $entries);

    // Fetch the listing page for the address; a failed request leaves it empty
    // instead of aborting the whole run.
    $entry['address'] = '';
    if ($entry['link'] !== '') {
        $listingpage = file_get_contents($entry['link']);
        if ($listingpage !== false && $listingpage !== '') {
            $dom2 = new DOMDocument();
            $dom2->loadHTML($listingpage);
            libxml_clear_errors();
            $xpath2 = new DOMXpath($dom2);
            // '//' searches the whole document. The previous './div[...]' was
            // evaluated against the document node, whose only element child is
            // <html>, so it could never match.
            // NOTE(review): div[3] is assumed to hold the address - confirm against the live markup.
            $entry['address'] = trim($firstText($xpath2, '//div[@class="mapAndAttrs"]/div[3]'));
        }
    }

    // The text node right after the price looks like "/ 2br - 900ft² -":
    // drop "/" and "-", split on spaces and discard the empty pieces, which
    // leaves at most two values: bedrooms and size.
    $text_node = $firstText(
        $xpath,
        './span[@class="txt"]/span[@class="l2"]/span[1]/following-sibling::text()[1]',
        $entries
    );
    $text_node = array_filter(explode(' ', str_replace(array('/', '-'), '', $text_node)));
    $entry['bedrooms'] = array_shift($text_node);   // null when the row has no such text
    $entry['dimensions'] = array_shift($text_node); // null when the size is missing

    $data[] = $entry;
}

echo '<pre>';
// The values come from a remote page, so escape them before printing into HTML.
echo htmlspecialchars(print_r($data, true), ENT_QUOTES, 'UTF-8');
**更新:我现在正试图进一步抓取已获取到的链接页面,以取得该房产的地址**
我想要实现的是用 preg_match 找出标题、URL、卧室数量、所在城市以及价格,然后打印出来。但是,如果我只输出 `$matches`,页面上只会显示 "Array"。如果使用上面的代码,页面加载后是一片空白。
有人可以查看我的代码并告诉我这里可能做错了什么吗? 谢谢!
答案 0 :(得分:1)
我谦卑地建议使用 DOMDocument
配合 DOMXpath,而不是正则表达式。
请使用合适的工具(HTML 解析器)。示例:Sample Fiddle
// Scrape the Craigslist apartment search results for one city with
// DOMDocument + DOMXPath and print one associative array per listing.
$city = 'saltlakecity';
$url = "http://".$city.".craigslist.org/search/apa/?bedrooms=2&hasPic=1&query=";
$base_url = parse_url($url, PHP_URL_HOST);

$resultspage = file_get_contents($url);
if ($resultspage === false || $resultspage === '') {
    // loadHTML() cannot parse a failed/empty download; stop with a readable message.
    exit('Unable to download the search results page.');
}

// Real-world HTML is rarely valid: collect libxml warnings internally instead
// of printing them.
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($resultspage);
libxml_clear_errors();
$xpath = new DOMXpath($dom);

// Returns the text of the first node matched by $expression relative to
// $context, or '' when nothing matches. This replaces both the unguarded
// item(0) dereferences and the "@" error suppression used for the address.
$firstText = function (DOMXPath $xp, $expression, $context = null) {
    $nodes = $xp->query($expression, $context);
    if (!($nodes instanceof DOMNodeList) || $nodes->length === 0) {
        return '';
    }
    return $nodes->item(0)->nodeValue;
};

$data = array();
$rows = $xpath->query('//p[@class="row"]'); // one <p class="row"> per search result
foreach ($rows as $entries) {
    $entry = array();
    $entry['title'] = $firstText($xpath, './span[@class="txt"]/span[@class="pl"]/a', $entries);

    // The href is relative to the site root, so it is prefixed with the host.
    // Calling getAttribute() on a missing node would be a fatal error.
    $anchors = $xpath->query('./a[@class="i"]', $entries);
    $href = $anchors->length > 0 ? $anchors->item(0)->getAttribute('href') : '';
    $entry['link'] = $href !== '' ? 'http://' . $base_url . $href : '';

    $entry['price'] = $firstText($xpath, './span[@class="txt"]/span[@class="l2"]/span[1]', $entries);

    // The text node right after the price looks like "/ 2br - 900ft² -":
    // drop "/" and "-", split on spaces and discard the empty pieces, which
    // leaves at most two values: bedrooms and size.
    $text_node = $firstText(
        $xpath,
        './span[@class="txt"]/span[@class="l2"]/span[1]/following-sibling::text()[1]',
        $entries
    );
    $text_node = array_filter(explode(' ', str_replace(array('/', '-'), '', $text_node)));
    $entry['bedrooms'] = array_shift($text_node);   // null when the row has no such text
    $entry['dimensions'] = array_shift($text_node); // null when the size is missing

    // The address is rendered wrapped in parentheses; strip them. Rows without
    // an address simply yield an empty string.
    $address = $firstText(
        $xpath,
        './span[@class="txt"]/span[@class="l2"]/span[@class="pnr"]/small',
        $entries
    );
    $entry['address'] = str_replace(array('(', ')'), '', $address);

    $data[] = $entry;
}

echo '<pre>';
// The values come from a remote page, so escape them before printing into HTML.
echo htmlspecialchars(print_r($data, true), ENT_QUOTES, 'UTF-8');
应该输出:
Array
(
[0] => Array
(
[title] => Beautiful Spacious Sandy Home for rent
[link] => http://saltlakecity.craigslist.org/apa/4605359897.html
[price] => $2050
[bedrooms] => 6br
[dimensions] => 3710ft²
[address] => 10251 Snow Iris Way, Sandy
)
and many more ...