Python Scrapy 爬虫无法产出 item

时间:2015-03-23 12:08:07

标签: python scrapy scraper scrapy-spider

我的spider.py是这样的:

class CSpider(scrapy.Spider):
    """Three-stage spider: ``parse`` -> ``method1`` -> ``method2``.

    Each stage requests the URLs collected by the previous one; only
    ``method2`` yields an item.
    """

    # The base class comes from the same ``scrapy`` package that provides
    # ``scrapy.Request`` below.
    name = 'craig'
    start_urls = ['http://geo.craigslist.org/iso/us/ca']

    def parse(self, response):
        """Request every collected URL, with ``method1`` as the callback."""
        # get url_list (placeholder: url_list is not built in this snippet)
        for url in url_list:
            yield scrapy.Request(url, self.method1)

    def method1(self, response):
        """Request every collected URL, with ``method2`` as the callback."""
        # get another url_list (placeholder: not built in this snippet)
        for url in url_list:
            yield scrapy.Request(url, self.method2)

    def method2(self, response):
        """Yield the populated item."""
        # populate item (placeholder: item is not built in this snippet)
        yield item

在这里,我的爬虫由 method2 填充 item。method2 的响应取决于 method1,method1 的响应取决于 parse。

我的爬虫会执行 parse 和 method1,但不会进入 method2,而 method2 正是产出 item 的主要方法。你能告诉我哪里出错了吗?

1 个答案:

答案 0 :(得分:1)

我从method1(parse_item)发出请求时找不到任何错误。

def parse_item(self, response):
    """Yield one detail-page request per listing row of a results page.

    Every ``span.pl`` row gets its own ``Advertisement`` filled with the post
    date, title, detail link, price and location.  The item is handed to
    ``parse_item_details`` through ``meta['item']`` and completed there.
    """
    # The first label of the response's hostname is used as the location.
    location = response.url.split('/')[2].split('.')[0]

    for row in response.xpath('//span[@class="pl"]'):
        # A fresh item per row: the requests are processed after this loop
        # has moved on, so one shared item would carry the values of a later
        # row by the time each callback runs.
        item = Advertisement()

        # Relative XPath: starting with '//' would search the whole document
        # and return the first row's date for every row.
        date_str = row.xpath('time/@datetime').extract()[0]
        item['post_datetime'] = to_datetime_object(date_str)
        item['post_title'] = row.xpath('a/text()').extract()[0]

        # The href may be absolute or relative.  urljoin (Scrapy 1.0+) keeps
        # absolute links (http or https) as they are and resolves relative
        # ones against response.url.
        detail_link = response.urljoin(row.xpath('a/@href').extract()[0])
        item['post_detail_link'] = detail_link

        price = row.xpath('following-sibling::*/span[@class="price"]/text()').extract()
        item['price'] = price[0] if price else 'Not Mentioned'
        item['location'] = location

        yield scrapy.Request(detail_link, self.parse_item_details, meta={'item': item})

def parse_item_details(self, response):
    """Complete the item carried in ``response.meta['item']`` and return it.

    ``model_year`` is the bold text of the first ``p.attrgroup`` span on the
    detail page.  When the page has no such span, or the span has no bold
    text, the field is set to ``'Not Mentioned'`` (the same placeholder used
    for a missing price) so the item is still returned instead of being lost
    to an ``IndexError``.
    """
    item = response.meta['item']
    spans = response.xpath('//p[@class="attrgroup"]/span')
    # Only the first span is consulted; guard both lookups against emptiness.
    values = spans[0].xpath('b/text()').extract() if spans else []
    item['model_year'] = values[0] if values else 'Not Mentioned'
    return item
items.py 中缺少 model_year 字段,所以 items.py 应该是这样的:

class Advertisement(scrapy.Item):
    """Item describing one listing, filled in by ``parse_item`` and
    ``parse_item_details``."""

    # Result of to_datetime_object() applied to the row's time/@datetime value.
    post_datetime = scrapy.Field()
    # Text of the row's anchor element.
    post_title = scrapy.Field()
    # Text of the row's price span, or 'Not Mentioned' when there is none.
    price = scrapy.Field()
    # URL of the listing's detail page.
    post_detail_link = scrapy.Field()
    # First label of the results page's hostname.
    location = scrapy.Field()
    # Bold text of the first attrgroup span on the detail page.
    model_year = scrapy.Field()

下面附上示例输出:

{"model_year": "2009 Ford Ranger", "post_title": "2009 Ford Ranger     pickup w/shell", "price": "$8495", "post_detail_link": "http://redding.craigslist.org/cto/4856643697.html", "location": "siskiyou", "post_datetime": "2015-03-23 17:41:00"}, 
{"model_year": "2011 ford ranger", "post_title": "[UPDATE]2006 ford ranger", "price": "$5500", "post_detail_link": "http://yubasutter.craigslist.org/cto/4883561404.html", "location": "sacramento", "post_datetime": "2015-03-24 21:43:00"},