I have written the following spider to crawl pages on http://www.funda.nl/:
import re

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from funda.items import FundaItem


class FundaSpider(CrawlSpider):

    name = "funda_spider"
    allowed_domains = ["funda.nl"]

    def __init__(self, place='amsterdam'):
        self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number) for page_number in range(1, 3001)]
        self.base_url = "http://www.funda.nl/koop/%s/" % place
        # Only follow listing links that live directly under /koop/<place>/.
        self.le1 = LinkExtractor(allow=r'%s+(huis|appartement)-\d{8}' % self.base_url)

    def parse(self, response):
        links = self.le1.extract_links(response)
        for link in links:
            # Keep only the canonical listing URLs (exactly 6 slashes, trailing slash).
            if link.url.count('/') == 6 and link.url.endswith('/'):
                item = FundaItem()
                item['url'] = link.url
                if re.search(r'/appartement-', link.url):
                    item['property_type'] = "apartment"
                elif re.search(r'/huis-', link.url):
                    item['property_type'] = "house"
                yield scrapy.Request(link.url, callback=self.parse_dir_contents, meta={'item': item})

    def parse_dir_contents(self, response):
        new_item = response.request.meta['item']
        # Address and postal code are taken from the page <title>,
        # which looks like "... te koop: <address> <postal code> ...".
        title = response.xpath('//title/text()').extract()[0]
        postal_code = re.search(r'\d{4} [A-Z]{2}', title).group(0)
        address = re.findall(r'te koop: (.*) \d{4}', title)[0]
        price_dd = response.xpath("//dt[contains(.,'Vraagprijs')]/following-sibling::dd[1]/text()").extract()[0]
        price = re.findall(r' \d+.\d+', price_dd)[0].strip()  # e.g. " 132.500" -> "132.500"
        year_built_dd = response.xpath("//dt[contains(.,'Bouwjaar')]/following-sibling::dd[1]/text()").extract()[0]
        year_built = re.findall(r'\d+', year_built_dd)[0]
        area_dd = response.xpath("//dt[contains(.,'Woonoppervlakte')]/following-sibling::dd[1]/text()").extract()[0]
        area = re.findall(r'\d+', area_dd)[0]
        rooms_dd = response.xpath("//dt[contains(.,'Aantal kamers')]/following-sibling::dd[1]/text()").extract()[0]
        rooms = re.findall(r'\d+ kamer', rooms_dd)[0].replace(' kamer', '')
        bedrooms = re.findall(r'\d+ slaapkamer', rooms_dd)[0].replace(' slaapkamer', '')
        new_item['postal_code'] = postal_code
        new_item['address'] = address
        new_item['price'] = price
        new_item['year_built'] = year_built
        new_item['area'] = area
        new_item['rooms'] = rooms
        new_item['bedrooms'] = bedrooms
        yield new_item
where FundaItem is defined as
import scrapy


class FundaItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
    title = scrapy.Field()
    address = scrapy.Field()
    postal_code = scrapy.Field()
    price = scrapy.Field()          # Listing price ("Vraagprijs")
    year_built = scrapy.Field()     # Year built ("Bouwjaar")
    area = scrapy.Field()           # Built area ("Woonoppervlakte")
    rooms = scrapy.Field()          # Number of rooms
    bedrooms = scrapy.Field()       # Number of bedrooms
    property_type = scrapy.Field()  # House or apartment
For example, if I run it with the command

scrapy crawl funda_spider -a place=amsterdam -o amsterdam.json

then I get a 719 KB JSON file that starts like this:
[
{"year_built": "1984", "area": "31", "url": "http://www.funda.nl/koop/amsterdam/appartement-49800928-jan-muschstraat-8/", "price": "132.500", "bedrooms": "1", "postal_code": "1065 LX", "rooms": "1", "address": "Jan Muschstraat 8", "property_type": "apartment"},
{"year_built": "1990", "area": "79", "url": "http://www.funda.nl/koop/amsterdam/appartement-85255640-zeeburgerkade-738-pp/", "price": "300.000", "bedrooms": "1", "postal_code": "1019 HT", "rooms": "2", "address": "Zeeburgerkade 738 +PP", "property_type": "apartment"},
{"year_built": "1906", "area": "93", "url": "http://www.funda.nl/koop/amsterdam/appartement-49897032-cliffordstraat-22-huis/", "price": "550.000", "bedrooms": "3", "postal_code": "1051 GT", "rooms": "4", "address": "Cliffordstraat 22 -HUIS", "property_type": "apartment"},
By specifying the keyword "amsterdam", I scrape all the houses and apartments listed under http://www.funda.nl/koop/amsterdam/.
So far, so good. But Funda also has pages for entire provinces, for example http://www.funda.nl/koop/provincie-zuid-holland/. If I try to scrape that with

scrapy crawl funda_spider -a place=provincie-zuid-holland -o zuid_holland.json
I get an empty JSON file:
[
What I suspect is that for some house or apartment outside Amsterdam the parsing somehow goes wrong, and that this makes the entire JSON output empty. How can I get the spider to also yield items for the province of Zuid-Holland?
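For reference, a minimal way to see what the LinkExtractor actually matches on a province page would be to reproduce it in a scrapy shell session (a sketch that simply mirrors the spider's le1 setup for place=provincie-zuid-holland):

# scrapy shell "http://www.funda.nl/koop/provincie-zuid-holland/"
from scrapy.linkextractors import LinkExtractor

base_url = "http://www.funda.nl/koop/provincie-zuid-holland/"
le1 = LinkExtractor(allow=r'%s+(huis|appartement)-\d{8}' % base_url)
links = le1.extract_links(response)  # `response` is provided by the shell
print(len(links))                    # 0 here would mean no listing links match the pattern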
Answer (score: 0)
The problem is the regular expression you use to filter the URLs: because it is anchored to base_url, the LinkExtractor finds no matching URLs on a province page, where the listing links point to /koop/<city>/... rather than /koop/provincie-zuid-holland/.... If you change it to LinkExtractor(allow=r'(huis|appartement)-\d{8}') (no idea whether that is exactly what you want), it will give you something like:
['http://www.funda.nl/koop/dordrecht/appartement-49650412-johanna-naber-erf-448/',
'http://www.funda.nl/koop/den-haag/appartement-49805676-moerweg-210/',
'http://www.funda.nl/koop/gouda/huis-49826340-baljuwslag-1/',
'http://www.funda.nl/koop/hillegom/huis-49825295-de-kwekerij-3/',
'http://www.funda.nl/koop/spijkenisse/huis-49825133-pampasgras-27/',
'http://www.funda.nl/koop/leiden/huis-49825907-vlietweg-11/',
'http://www.funda.nl/koop/dordrecht/huis-49825879-driehoek-6/',
'http://www.funda.nl/koop/zevenhuizen-zh/huis-49825567-kratonlaan-2/',
'http://www.funda.nl/koop/voorhout/huis-49814862-ludolph-bohlenstraat-21/',
'http://www.funda.nl/koop/brielle/huis-85278226-koningsdiep-23/',
'http://www.funda.nl/koop/middelharnis/huis-49814415-prins-bernhardlaan-124/',
'http://www.funda.nl/koop/den-haag/huis-49814404-van-veendijk-8/',
'http://www.funda.nl/koop/alphen-aan-den-rijn/huis-49814472-barentszstraat-29/',
'http://www.funda.nl/koop/hazerswoude-rijndijk/huis-49813001-rijndijk-123/',
'http://www.funda.nl/koop/schiedam/huis-49812284-singel-94/',
'http://www.funda.nl/koop/alphen-aan-den-rijn/huis-49812863-gouwsluisseweg-91/',
'http://www.funda.nl/koop/voorburg/huis-49811030-charlotte-van-pallandtlaan-23/']
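Applied to your spider, that comes down to replacing the le1 line in __init__ so it is no longer anchored to base_url (a sketch; the tighter pattern in the last comment is only an assumption about what you might want, not something taken from your code):

    def __init__(self, place='amsterdam'):
        self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number) for page_number in range(1, 3001)]
        self.base_url = "http://www.funda.nl/koop/%s/" % place
        # Match any listing URL regardless of which city it lives under, so that
        # province pages (which link to listings in many different cities) work too.
        self.le1 = LinkExtractor(allow=r'(huis|appartement)-\d{8}')
        # If that turns out to be too broad, a tighter variant could be:
        # self.le1 = LinkExtractor(allow=r'funda\.nl/koop/[^/]+/(huis|appartement)-\d{8}')

The link.url.count('/') == 6 check in parse still holds for those city-level listing URLs (e.g. http://www.funda.nl/koop/den-haag/appartement-49805676-moerweg-210/), so nothing else needs to change.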