scrapy爬行嵌套网址

时间:2016-05-20 11:18:13

标签: python scrapy

我是蹩脚的嵌套网址使用CrawlSpider,但scrapy抓取项目之后,sidout发现错误:

 yield request(url=urls,callback=self.parse_netsted_item)
TypeError: 'module' object is not callable

carspider.py:

# -*- coding=utf-8 -*-
from __future__ import absolute_import
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule,Spider
from car.items import Car58Item
from scrapy.http import request


class CarSpider (CrawlSpider):
    name ='car'
    allowed_domains = ['58.com']
    start_urls = ['http://quanguo.58.com/ershouche']
    rules = [Rule(LinkExtractor(allow=('/pn\d+')),callback='parse_item',follow=True)] #//页面读取策略

    items = {}

    def parse_item(self,response):
        trs = response.xpath("//div[@id='infolist']/table[@class='tbimg']/tr")
        for tr in trs:
            item = Car58Item()
            urls = tr.xpath("td[@class='img']/a/@href").extract()
            item['url'] = tr.xpath("td[@class='img']/a/@href").extract()
            item['tip'] = tr.xpath("td[@class='t']/a/font/text()").extract()
            item['name'] = tr.xpath("td[@class='t']/a[1]/text()").extract()
            item['size'] = tr.xpath("td[@class='t']/p/text()").extract()
            item['region'] = tr.xpath("td[@class='tc']/a/text()").extract()
            item['amt'] = tr.xpath("td[@class='tc']/b/text()").extract()
            yield request(url=urls,callback=self.parse_netsted_item,meta={'item':item})

    def parse_netsted_item(self,response):
        mode = response.xpath("//body")
        item = response.meta['item']
        item['lianxiren'] = mode.xpath("//div[@id='content_sumary_right']/p[1]/span[2]/a/text()").extract()
        item['lianxiren_dh'] = mode.xpath("//div[@id='content_sumary_right']/p[2]/span[2]/text()").extract()
        item['lianxiren_dz'] = mode.xpath("//div[@id='content_sumary_right']/p[3]/span[2]/text()").extract()
        item['details'] = mode.xpath("//div[@id='nonecur']/ul").extract()
        item['description'] = mode.xpath("//div[@id='nonecur']/div[4]/div/text()").extract()

        item['wzgl'] = mode.xpath("//div[@id='nonecur']/ul/li[1]/span[2]/text()").extract()
        item['time'] = mode.xpath("//div[@id='nonecur']/ul/li[2]/span[2]/text()").extract()
        item['lc'] = mode.xpath("//div[@id='nonecur']/ul/li[3]/span[2]/text()").extract()
        item['pl'] = mode.xpath("//div[@id='nonecur']/ul/li[4]/span[2]/text()").extract()
        item['bsx'] = mode.xpath("//div[@id='nonecur']/ul/li[5]/span[2]/text()").extract()
        item['ys'] = mode.xpath("//div[@id='nonecur']/ul/li[6]/span[2]/text()").extract()
        item['njdq'] = mode.xpath("//div[@id='nonecur']/ul/li[7]/span[2]/text()").extract()
        item['jqx'] = mode.xpath("//div[@id='nonecur']/ul/li[8]/span[2]/text()").extract()
        item['whby'] = mode.xpath("//div[@id='nonecur']/ul/li[9]/span[2]/text()").extract()
        item['sgls'] = mode.xpath("//div[@id='nonecur']/ul/li[10]/span[2]/text()").extract()
        item['qdfs'] = mode.xpath("//div[@id='nonecur']/ul/li[11]/span[2]/text()").extract()
        item['size ='] = mode.xpath("//div[@id='nonecur']/ul/li[12]/span[2]/text()").extract()

        return item
shell命令中的

$scrapy crawl car

输出如下:

016-05-20 18:55:50 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn4/> (referer: http://quanguo.58.com/ershouche/)
2016-05-20 18:55:50 [scrapy] ERROR: Spider error processing <GET http://quanguo.58.com/ershouche/pn4/> (referer: http://quanguo.58.com/ershouche/)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 28, in process_spider_output
    for x in result:
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spiders/crawl.py", line 69, in _parse_response
    for requests_or_item in iterate_spider_output(cb_res):
  File "/Users/mayuping/PycharmProjects/car/car/spiders/car.py", line 26, in parse_item
  

yield request(url = urls,callback = self.parse_netsted_item)       TypeError:'module'对象不可调用

^C2016-05-20 18:55:51 [scrapy] INFO: Received SIGINT twice, forcing unclean shutdown

1 个答案:

答案 0 :(得分:1)

您使用了错误的方法来呼叫请求。您应该使用scrapy.Request

此外,您尝试通过1次调用请求多个网址,您必须将其循环

import scrapy
for link in urls:
    yield scrapy.Request(url=link,meta={'item':item}, callback=self.parse_netsted_item)

那应该有用