我的spider.py是这样的:
class CSpider(scrapy.Spider):
    """Three-stage spider: ``parse`` -> ``method1`` -> ``method2``.

    Each stage schedules requests whose responses are handled by the next
    stage; only ``method2`` yields an item.
    """

    name = 'craig'
    start_urls = ['http://geo.craigslist.org/iso/us/ca']

    def parse(self, response):
        """Handle a start-URL response and schedule ``method1`` per URL."""
        # get url_list (placeholder: the extraction code is elided here)
        for url in url_list:
            yield scrapy.Request(url, self.method1)

    def method1(self, response):
        """Handle a first-level response and schedule ``method2`` per URL."""
        # get another url_list (placeholder: the extraction code is elided here)
        for url in url_list:
            yield scrapy.Request(url, self.method2)

    def method2(self, response):
        """Handle a second-level response and yield the populated item."""
        # populate item (placeholder: the item-building code is elided here)
        yield item
在这里,`method2` 负责填充 item。`method2` 收到的响应来自 `method1` 发出的请求,而 `method1` 收到的响应来自 `parse` 发出的请求。
我的爬虫执行了 `parse` 和 `method1`,但没有进入 `method2`,而它正是产出 item 的主要方法。你能告诉我哪里出错了吗?
答案 0 :(得分:1)
我从method1(parse_item)发出请求时找不到任何错误。
def parse_item(self, response):
    """Yield one detail-page request per listing row found in *response*.

    Every request carries its own partially filled ``Advertisement`` in
    ``meta['item']`` and is routed to ``self.parse_item_details``.

    Raises:
        IndexError: if a row lacks the ``time/@datetime`` attribute, the
            link text, or the link ``href``.
    """
    # Third '/'-separated component of the URL, i.e. the host of an
    # absolute URL such as 'http://host/path'.
    host = response.url.split('/')[2]
    location = host.split('.')[0]
    detail_link_prefix = 'http://' + host

    for sel in response.xpath('//span[@class="pl"]'):
        # A fresh item per row: requests are processed after this loop has
        # moved on, so a single shared item would reach every callback
        # holding only the last row's values.
        item = Advertisement()

        # Relative XPath: a path starting with '//' is evaluated against the
        # whole document even when called on ``sel``, which would give every
        # row the first row's date.
        date_str = sel.xpath('time/@datetime').extract()[0]
        item['post_datetime'] = to_datetime_object(date_str)
        item['post_title'] = sel.xpath('a/text()').extract()[0]

        # The href is sometimes absolute and sometimes relative.
        extracted_link = sel.xpath('a/@href').extract()[0]
        if extracted_link.startswith(('http://', 'https://')):
            detail_link = extracted_link
        else:
            detail_link = detail_link_prefix + extracted_link
        item['post_detail_link'] = detail_link

        price = sel.xpath(
            'following-sibling::*/span[@class="price"]/text()').extract()
        item['price'] = price[0] if price else 'Not Mentioned'
        item['location'] = location

        yield scrapy.Request(detail_link, self.parse_item_details,
                             meta={'item': item})
def parse_item_details(self, response):
    """Complete the item carried in ``response.meta`` and return it.

    Reads the ``b`` text of the first ``//p[@class="attrgroup"]/span``
    element into ``item['model_year']``. When the page has no such element
    (or it has no ``b`` text), ``model_year`` is set to ``None`` so the
    fields collected earlier are still returned instead of being lost to an
    ``IndexError``.

    Raises:
        KeyError: if ``response.meta`` has no ``'item'`` entry.
    """
    item = response.meta['item']
    spans = response.xpath('//p[@class="attrgroup"]/span')
    texts = spans[0].xpath('b/text()').extract() if spans else []
    item['model_year'] = texts[0] if texts else None
    return item
items.py 中缺少 `model_year` 字段,因此 items.py 应该如下所示:
class Advertisement(scrapy.Item):
    """Item describing one scraped advertisement."""

    # Fields taken from the listing row.
    post_datetime = scrapy.Field()
    post_title = scrapy.Field()
    price = scrapy.Field()
    post_detail_link = scrapy.Field()
    location = scrapy.Field()

    # Field taken from the advertisement's detail page.
    model_year = scrapy.Field()
下面附上示例输出:
{"model_year": "2009 Ford Ranger", "post_title": "2009 Ford Ranger pickup w/shell", "price": "$8495", "post_detail_link": "http://redding.craigslist.org/cto/4856643697.html", "location": "siskiyou", "post_datetime": "2015-03-23 17:41:00"},
{"model_year": "2011 ford ranger", "post_title": "[UPDATE]2006 ford ranger", "price": "$5500", "post_detail_link": "http://yubasutter.craigslist.org/cto/4883561404.html", "location": "sacramento", "post_datetime": "2015-03-24 21:43:00"},