I'm clumsily crawling nested URLs with CrawlSpider, but after scrapy crawls the items, stdout shows this error:
yield request(url=urls,callback=self.parse_netsted_item)
TypeError: 'module' object is not callable
carspider.py:
# -*- coding=utf-8 -*-
from __future__ import absolute_import
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule, Spider
from car.items import Car58Item
from scrapy.http import request

class CarSpider(CrawlSpider):
    name = 'car'
    allowed_domains = ['58.com']
    start_urls = ['http://quanguo.58.com/ershouche']
    # rule for following the paginated listing pages
    rules = [Rule(LinkExtractor(allow=('/pn\d+')), callback='parse_item', follow=True)]
    items = {}

    def parse_item(self, response):
        trs = response.xpath("//div[@id='infolist']/table[@class='tbimg']/tr")
        for tr in trs:
            item = Car58Item()
            urls = tr.xpath("td[@class='img']/a/@href").extract()
            item['url'] = tr.xpath("td[@class='img']/a/@href").extract()
            item['tip'] = tr.xpath("td[@class='t']/a/font/text()").extract()
            item['name'] = tr.xpath("td[@class='t']/a[1]/text()").extract()
            item['size'] = tr.xpath("td[@class='t']/p/text()").extract()
            item['region'] = tr.xpath("td[@class='tc']/a/text()").extract()
            item['amt'] = tr.xpath("td[@class='tc']/b/text()").extract()
            yield request(url=urls, callback=self.parse_netsted_item, meta={'item': item})

    def parse_netsted_item(self, response):
        mode = response.xpath("//body")
        item = response.meta['item']
        item['lianxiren'] = mode.xpath("//div[@id='content_sumary_right']/p[1]/span[2]/a/text()").extract()
        item['lianxiren_dh'] = mode.xpath("//div[@id='content_sumary_right']/p[2]/span[2]/text()").extract()
        item['lianxiren_dz'] = mode.xpath("//div[@id='content_sumary_right']/p[3]/span[2]/text()").extract()
        item['details'] = mode.xpath("//div[@id='nonecur']/ul").extract()
        item['description'] = mode.xpath("//div[@id='nonecur']/div[4]/div/text()").extract()
        item['wzgl'] = mode.xpath("//div[@id='nonecur']/ul/li[1]/span[2]/text()").extract()
        item['time'] = mode.xpath("//div[@id='nonecur']/ul/li[2]/span[2]/text()").extract()
        item['lc'] = mode.xpath("//div[@id='nonecur']/ul/li[3]/span[2]/text()").extract()
        item['pl'] = mode.xpath("//div[@id='nonecur']/ul/li[4]/span[2]/text()").extract()
        item['bsx'] = mode.xpath("//div[@id='nonecur']/ul/li[5]/span[2]/text()").extract()
        item['ys'] = mode.xpath("//div[@id='nonecur']/ul/li[6]/span[2]/text()").extract()
        item['njdq'] = mode.xpath("//div[@id='nonecur']/ul/li[7]/span[2]/text()").extract()
        item['jqx'] = mode.xpath("//div[@id='nonecur']/ul/li[8]/span[2]/text()").extract()
        item['whby'] = mode.xpath("//div[@id='nonecur']/ul/li[9]/span[2]/text()").extract()
        item['sgls'] = mode.xpath("//div[@id='nonecur']/ul/li[10]/span[2]/text()").extract()
        item['qdfs'] = mode.xpath("//div[@id='nonecur']/ul/li[11]/span[2]/text()").extract()
        item['size'] = mode.xpath("//div[@id='nonecur']/ul/li[12]/span[2]/text()").extract()
        return item
Running this shell command:
$ scrapy crawl car
the output is as follows:
2016-05-20 18:55:50 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn4/> (referer: http://quanguo.58.com/ershouche/)
2016-05-20 18:55:50 [scrapy] ERROR: Spider error processing <GET http://quanguo.58.com/ershouche/pn4/> (referer: http://quanguo.58.com/ershouche/)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 28, in process_spider_output
    for x in result:
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spiders/crawl.py", line 69, in _parse_response
    for requests_or_item in iterate_spider_output(cb_res):
  File "/Users/mayuping/PycharmProjects/car/car/spiders/car.py", line 26, in parse_item
    yield request(url = urls,callback = self.parse_netsted_item)
TypeError: 'module' object is not callable
^C2016-05-20 18:55:51 [scrapy] INFO: Received SIGINT twice, forcing unclean shutdown
Answer (score: 1):
You are calling Request the wrong way: from scrapy.http import request imports the request module, not the Request class, which is why Python raises "TypeError: 'module' object is not callable". You should use scrapy.Request instead.
Also, you are trying to request multiple URLs with a single call, and urls is a list returned by extract(), so you have to loop over it:
import scrapy

for link in urls:
    yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse_netsted_item)
That should work.
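For reference, here is a minimal sketch of parse_item with that fix folded in. The fields and XPaths are taken from the question; the response.urljoin call is an extra assumption added in case the extracted hrefs are relative:

from scrapy.http import Request  # Request (capital R) is the class; scrapy.http.request is a module

def parse_item(self, response):
    trs = response.xpath("//div[@id='infolist']/table[@class='tbimg']/tr")
    for tr in trs:
        item = Car58Item()
        # extract() always returns a list, even when only one node matches
        urls = tr.xpath("td[@class='img']/a/@href").extract()
        item['url'] = urls
        item['tip'] = tr.xpath("td[@class='t']/a/font/text()").extract()
        # ... fill in the remaining listing fields as in the question ...
        for link in urls:
            # one Request per detail page, passing the partially filled item along
            yield Request(url=response.urljoin(link),
                          callback=self.parse_netsted_item,
                          meta={'item': item})

parse_netsted_item then receives the item via response.meta, completes it from the detail page, and returns it, exactly as in the original spider.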