I'm writing a small crawler with Scrapy. I'd like to be able to pass a start_url argument to my spider, which would then let me run it via Celery (or something else).
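For context, this is how I understand the arguments are supposed to be passed: everything given with -a on the command line arrives in the spider's __init__ as a keyword argument (the URL here is just a placeholder):

scrapy crawl OnetSpider -a start_url=http://example.com/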
I hit a wall passing the arguments, and I'm getting this error:
2016-03-13 08:50:50 [scrapy] INFO: Enabled extensions: CloseSpider, TelnetConsole, LogStats, CoreStats, SpiderState
Unhandled error in Deferred:
2016-03-13 08:50:50 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/local/lib/python2.7/dist-packages/scrapy/commands/crawl.py", line 57, in run
self.crawler_process.crawl(spname, **opts.spargs)
File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 153, in crawl
d = crawler.crawl(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 1274, in unwindGenerator
return _inlineCallbacks(None, gen, Deferred())
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 1128, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 70, in crawl
self.spider = self._create_spider(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 80, in _create_spider
return self.spidercls.from_crawler(self, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/scrapy/spiders/crawl.py", line 91, in from_crawler
spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/scrapy/spiders/__init__.py", line 50, in from_crawler
spider = cls(*args, **kwargs)
exceptions.TypeError: __init__() takes at least 3 arguments (1 given)
2016-03-13 08:50:50 [twisted] CRITICAL:
The spider code is below:
from onet.items import OnetItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urlparse import urlparse
import datetime

class OnetSpider(CrawlSpider):
    name = 'OnetSpider'

    def __init__(self, ur, *args, **kwargs):
        super(OnetSpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]

    #allowed_domains = ['katalog.onet.pl']
    #start_urls = ['http://katalog.onet.pl/']
    response_url = ""
    rules = [Rule(LinkExtractor(unique=True),
                  callback="parse_items",
                  follow=True)]

    def parse_start_url(self, response):
        self.response_url = response.url
        return self.parse_items(response)

    def parse_items(self, response):
        baseDomain = self.get_base_domain(self.response_url)
        for sel in response.xpath('//a'):
            l = sel.xpath('@href').extract()[0]
            t = sel.xpath('text()').extract()
            if self.is_relative(l) or baseDomain.upper() in l.upper():
                continue
            else:
                itm = OnetItem()
                itm['anchorTitle'] = t
                itm['link'] = self.process_url(l)
                itm['timeStamp'] = datetime.datetime.now()
                itm['isChecked'] = 0
                itm['responseCode'] = 0
                itm['redirecrURL'] = ''
                yield itm

    def is_relative(self, url):
        # checks if url is a relative path or absolute
        if urlparse(url).netloc == "":
            return True
        else:
            return False

    def get_base_domain(self, url):
        # returns the base url stripped of www/ftp prefixes and any port
        base = urlparse(url).netloc
        if base.upper().startswith("WWW."):
            base = base[4:]
        if base.upper().startswith("FTP."):
            base = base[4:]
        base = base.split(':')[0]
        return base
    def process_url(self, url):
        u = urlparse(url)
        if u.scheme == '':
            # ParseResult is immutable; use _replace() rather than assigning
            # to u.scheme, which would raise an AttributeError
            u = u._replace(scheme='http')
        finalURL = u.scheme + '://' + u.netloc + '/'
        return finalURL.lower()
I'm pretty sure this has to do with passing the arguments, because without the def __init__ the spider runs fine.
Any idea what the problem is?
I'm running this on my Ubuntu VPS.
Answer 0 (score: 2)
So I managed to get the crawler working. I'm not sure exactly what fixed it; I just took the original spider and copied the def __init__ part from the Scrapy source file.
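To make sense of why that helps: from what I can tell, Scrapy instantiates the spider itself and only forwards the -a options as keyword arguments, so a required positional parameter like ur in the original signature is never supplied. A sketch of the difference (my reading of the traceback, not anything official):

# failing: 'ur' is required, but cls(*args, **kwargs) in from_crawler
# receives only keyword arguments, hence the TypeError above
def __init__(self, ur, *args, **kwargs): ...

# working: nothing is required; read what you need out of **kw
def __init__(self, *a, **kw): ...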
Below is the working version, for historical reference only. I tested one of scrapinghub's examples and it worked, which makes me think my spider probably has some small bug in it, so I may end up rewriting it anyway.
Anyway, here is the working sample:
from onet.items import OnetItem
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
#from scrapy import log
import logging
from urlparse import urlparse
import datetime

logger = logging.getLogger('-------MICHAL------')

class WebSpider(CrawlSpider):
    name = 'WebSpider'

    def __init__(self, *a, **kw):
        super(WebSpider, self).__init__(*a, **kw)
        self._compile_rules()
        url = kw.get('url') or kw.get('domain')
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.start_urls = [self.url]
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]

    response_url = ""
    rules = [Rule(LinkExtractor(unique=True),
                  callback="parse_items",
                  follow=True)]

    def parse_start_url(self, response):
        self.response_url = response.url
        return self.parse_items(response)

    def parse_items(self, response):
        baseDomain = self.get_base_domain(self.response_url)
        for sel in response.xpath('//a'):
            l = sel.xpath('@href').extract()[0]
            t = sel.xpath('text()').extract()
            if self.is_relative(l) or baseDomain.upper() in l.upper():
                continue
            else:
                itm = OnetItem()
                itm['anchorTitle'] = t
                itm['link'] = self.process_url(l)
                itm['timeStamp'] = datetime.datetime.now()
                itm['isChecked'] = 0
                itm['responseCode'] = 0
                itm['redirecrURL'] = ''
                yield itm

    def is_relative(self, url):
        # checks if url is a relative path or absolute
        if urlparse(url).netloc == "":
            return True
        else:
            return False
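Since the original goal was to run this through Celery, here is a minimal sketch of launching the spider from Python code instead of the command line (my own addition, not part of the original answer; run_spider and the example domain are placeholders). One caveat: CrawlerProcess starts the Twisted reactor, which cannot be restarted within the same process, so with Celery this usually has to run in a fresh worker process or subprocess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_spider(url):
    # keyword arguments to crawl() are forwarded to WebSpider.__init__,
    # exactly like -a options on the command line
    process = CrawlerProcess(get_project_settings())
    process.crawl(WebSpider, url=url)
    process.start()  # blocks until the crawl finishes

run_spider('katalog.onet.pl')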