获取内容(URL)并运行 preg_match_all,然后按指定格式打印结果

时间:2014-08-05 23:16:51

标签: php html xpath web-scraping domdocument

我正在尝试抓取 craigslist 上的公寓房源信息。

代码:

// Scrape the Craigslist apartment search results for one city and, for every
// result row, follow the listing link to read the address from the detail page.
// The collected rows are dumped with print_r() inside a <pre> block.
$city = 'saltlakecity';
$rooms = '';
$query = '';
$sdate = '';

// Build the query string with http_build_query() so that each value is
// percent-encoded; raw concatenation breaks as soon as $query contains a
// space or an '&'.
$url = 'http://' . $city . '.craigslist.org/search/apa?' . http_build_query(array(
    'bedrooms'  => $rooms,
    'query'     => $query,
    'sale_date' => $sdate,
), '', '&');
$base_url = parse_url($url, PHP_URL_HOST);

// Real-world HTML is rarely valid; collect libxml errors internally instead of
// emitting a warning per malformed tag. One call covers every loadHTML() below.
libxml_use_internal_errors(true);

// Returns the text of the first node matched by $expr (optionally relative to
// $context), or '' when the expression matches nothing. This avoids
// dereferencing the null that item(0) returns for an empty node list.
$firstText = function (DOMXPath $xp, $expr, $context = null) {
    $nodes = ($context === null) ? $xp->query($expr) : $xp->query($expr, $context);
    if ($nodes === false || $nodes->length === 0) {
        return '';
    }
    return $nodes->item(0)->nodeValue;
};

$data = array();
$resultspage = file_get_contents($url);

// file_get_contents() returns false on failure (and has already raised a
// warning); loadHTML() must not be fed false or an empty string.
if ($resultspage !== false && $resultspage !== '') {
    $dom = new DOMDocument();
    $dom->loadHTML($resultspage);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    $rows = $xpath->query('//p[@class="row"]'); // every result row
    foreach ($rows as $row) {
        // A row without its image/link anchor cannot be followed; skip it
        // rather than calling getAttribute() on null (a fatal error).
        $linkNodes = $xpath->query('./a[@class="i"]', $row);
        if ($linkNodes === false || $linkNodes->length === 0) {
            continue;
        }

        $entry = array();
        $entry['title'] = $firstText($xpath, './span[@class="txt"]/span[@class="pl"]/a', $row);
        $entry['link'] = 'http://' . $base_url . $linkNodes->item(0)->getAttribute('href');
        $entry['price'] = $firstText($xpath, './span[@class="txt"]/span[@class="l2"]/span[1]', $row);

        // The location is rendered as "(Some Place)"; drop the parentheses.
        $location = $firstText($xpath, './span[@class="txt"]/span[@class="l2"]/span[2]', $row);
        $entry['location'] = str_replace(array('(', ')'), '', $location);
        $entry['seller'] = $firstText($xpath, './span[@class="txt"]/span[@class="l2"]/a', $row);

        // Follow the listing link to pick up the address from the detail page.
        $entry['address'] = '';
        $listingpage = file_get_contents($entry['link']);
        if ($listingpage !== false && $listingpage !== '') {
            $dom2 = new DOMDocument();
            $dom2->loadHTML($listingpage);
            libxml_clear_errors();
            $xpath2 = new DOMXPath($dom2);
            // Search the whole document with '//' here. A './div' expression
            // with no context node is evaluated from the document node, whose
            // only element child is <html>, so it can never match this div.
            $entry['address'] = $firstText($xpath2, '//div[@class="mapAndAttrs"]/div[3]');
        }

        // The text right after the price span looks like "/ 2br - 900ft² -".
        // Strip "/" and "-", split on spaces and drop the empty pieces, which
        // leaves at most two tokens: bedrooms, then size.
        $details = $firstText(
            $xpath,
            './span[@class="txt"]/span[@class="l2"]/span[1]/following-sibling::text()[1]',
            $row
        );
        $details = array_filter(explode(' ', str_replace(array('/', '-'), '', $details)));
        $entry['bedrooms'] = array_shift($details);   // null when absent
        $entry['dimensions'] = array_shift($details); // null when absent

        $data[] = $entry;
    }
}

echo '<pre>';
print_r($data);

**更新:我现在正尝试进一步抓取上面获取到的房源链接(详情页),以获取该房产的地址**

我想实现的是:用 preg_match 找出标题、URL、卧室数量、所在城市以及价格,然后把它们打印出来。但是,如果我只是输出“$matches”,页面上只会显示“Array”。而如果我使用上面的代码,页面加载后是一片空白。

有人可以查看我的代码并告诉我这里可能做错了什么吗? 谢谢!

1 个答案:

答案 0 :(得分:1)

我谦卑地建议使用合适的工具(HTML 解析器)而不是正则表达式:也就是使用 DOMDocument 并配合 DOMXpath。示例:Sample Fiddle

// Scrape the Craigslist apartment search results for one city with
// DOMDocument + DOMXPath and dump one associative array per result row.
$city = 'saltlakecity';
$url = "http://".$city.".craigslist.org/search/apa/?bedrooms=2&hasPic=1&query=";
$base_url = parse_url($url, PHP_URL_HOST);

// Real-world HTML is rarely valid; collect libxml errors internally instead of
// emitting a warning per malformed tag.
libxml_use_internal_errors(true);

// Returns the text of the first node matched by $expr relative to $context, or
// '' when nothing matches. This replaces the "@" suppression operator: the
// missing-node case is handled explicitly instead of being silenced.
$firstText = function (DOMXPath $xp, $expr, DOMNode $context) {
    $nodes = $xp->query($expr, $context);
    if ($nodes === false || $nodes->length === 0) {
        return '';
    }
    return $nodes->item(0)->nodeValue;
};

$data = array();
$resultspage = file_get_contents($url);

// file_get_contents() returns false on failure (and has already raised a
// warning); loadHTML() must not be fed false or an empty string.
if ($resultspage !== false && $resultspage !== '') {
    $dom = new DOMDocument();
    $dom->loadHTML($resultspage);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    $rows = $xpath->query('//p[@class="row"]'); // every result row
    foreach ($rows as $row) {
        // A row without its link anchor would make getAttribute() a fatal
        // "call to a member function on null"; skip such rows.
        $linkNodes = $xpath->query('./a[@class="i"]', $row);
        if ($linkNodes === false || $linkNodes->length === 0) {
            continue;
        }

        $entry = array();
        $entry['title'] = $firstText($xpath, './span[@class="txt"]/span[@class="pl"]/a', $row);
        $entry['link'] = 'http://' . $base_url . $linkNodes->item(0)->getAttribute('href');
        $entry['price'] = $firstText($xpath, './span[@class="txt"]/span[@class="l2"]/span[1]', $row);

        // The text right after the price span looks like "/ 2br - 900ft² -".
        // Strip "/" and "-", split on spaces and drop the empty pieces, which
        // leaves at most two tokens: bedrooms, then size.
        $details = $firstText(
            $xpath,
            './span[@class="txt"]/span[@class="l2"]/span[1]/following-sibling::text()[1]',
            $row
        );
        $details = array_filter(explode(' ', str_replace(array('/', '-'), '', $details)));
        $entry['bedrooms'] = array_shift($details);   // null when absent
        $entry['dimensions'] = array_shift($details); // null when absent

        // The address is optional and rendered as "(street, city)"; drop the
        // parentheses. Rows without one get an empty string.
        $address = $firstText($xpath, './span[@class="txt"]/span[@class="l2"]/span[@class="pnr"]/small', $row);
        $entry['address'] = str_replace(array('(', ')'), '', $address);

        $data[] = $entry;
    }
}

echo '<pre>';
print_r($data);

应该输出:

Array
(
    [0] => Array
        (
            [title] => Beautiful Spacious Sandy Home for rent
            [link] => http://saltlakecity.craigslist.org/apa/4605359897.html
            [price] => $2050
            [bedrooms] => 6br
            [dimensions] => 3710ft²
            [address] =>  10251 Snow Iris Way, Sandy
        )
    and many more ...