这让我发疯了,因为我似乎太笨了,无法给抓取的正则表达式结构列表供爬虫在拒绝规则中使用。
我尝试过滤掉所有包含单词(部分)“ top”,“ kredit”和“ versicherung”的项目URL(并尝试添加更多URL部分作为过滤条件)。我尝试了以下结构:
rules = [Rule(LinkExtractor(deny=r'versicher',
r='top',
r'kredit',),
callback='parse_item', follow=True)]
但是,这会导致语法错误(单独使用deny = r'versicher')。
我没有在文档或在线示例中为虚拟初学者提供任何帮助,所以也许有人可以在这里帮助我?
这是蜘蛛:
import scrapy
import logging
from scrapy.loader import ItemLoader
from ..items import NorisbankItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.log import configure_logging
class NorisbankSpider(CrawlSpider):
name = "nbtest"
allowed_domains = ['norisbank.de']
start_urls = ['https://www.norisbank.de']
custom_settings={ 'FEED_URI': "norisbank_%(time)s.json",
'FEED_FORMAT': 'json',
}
configure_logging(install_root_handler=False)
logging.basicConfig(
filename='nbtest.log',
format='%(levelname)s: %(message)s',
level=logging.INFO
)
rules = [Rule(LinkExtractor(deny=[r'versicher',
r'kredit']),
callback='parse_norisbank', follow=True)],
def parse_norisbank(self, response):
page = response.url.split("/")[-2]
filename = 'nbtest-%s.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
#Content Extraction
print(response.url)
l = ItemLoader(NorisbankItem(), response=response)
l.add_xpath('sitename', "//meta[@property='og:site_name']/@content")
l.add_xpath('siteurl', "//link[@rel='canonical']/@href")
l.add_xpath('dbCategory',"//meta[@name='dbCategory']/@content")
l.add_css('title','title::text')
l.add_xpath('descriptions',"normalize-space(//meta[@name='description']/@content)")
l.add_xpath('date',"//meta[@name='date']/@content")
l.add_xpath('version',"//meta[@name='v']/@content")
l.add_xpath('time',"//meta[@name='time']/@content")
l.add_xpath('sitecontent','//body//p//text()')
yield l.load_item()
all_pages = response.xpath('//a[contains(@href, "html")]/@href').getall()
for next_page in all_pages :
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, callback=self.parse)
这是日志输出:
2020-07-17 13:58:30 [twisted] CRITICAL: Unhandled error in Deferred:
2020-07-17 13:58:30 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/crawler.py", line 192, in crawl
return self._crawl(crawler, *args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/crawler.py", line 196, in _crawl
d = crawler.crawl(*args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/twisted/internet/defer.py", line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/twisted/internet/defer.py", line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
--- <exception caught here> ---
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/crawler.py", line 86, in crawl
self.spider = self._create_spider(*args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/crawler.py", line 98, in _create_spider
return self.spidercls.from_crawler(self, *args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 143, in from_crawler
spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/__init__.py", line 49, in from_crawler
spider = cls(*args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 79, in __init__
self._compile_rules()
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 139, in _compile_rules
self._rules[-1]._compile(self)
builtins.AttributeError: 'list' object has no attribute '_compile'
2020-07-17 13:58:30 [twisted] CRITICAL:
Traceback (most recent call last):
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/crawler.py", line 86, in crawl
self.spider = self._create_spider(*args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/crawler.py", line 98, in _create_spider
return self.spidercls.from_crawler(self, *args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 143, in from_crawler
spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/__init__.py", line 49, in from_crawler
spider = cls(*args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 79, in __init__
self._compile_rules()
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 139, in _compile_rules
self._rules[-1]._compile(self)
AttributeError: 'list' object has no attribute '_compile'
2020-07-17 13:58:30 [twisted] CRITICAL:
Traceback (most recent call last):
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/crawler.py", line 86, in crawl
self.spider = self._create_spider(*args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/crawler.py", line 98, in _create_spider
return self.spidercls.from_crawler(self, *args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 143, in from_crawler
spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/__init__.py", line 49, in from_crawler
spider = cls(*args, **kwargs)
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 79, in __init__
self._compile_rules()
File "/export/home/search/app/venv_p36_scrapy/lib/python3.6/site-packages/scrapy/spiders/crawl.py", line 139, in _compile_rules
self._rules[-1]._compile(self)
AttributeError: 'list' object has no attribute '_compile'
我已将代码放入回溯块中,但是未获得输出日志中的其他信息。由于我对此很陌生,也许您可以帮助我如何获取更多有用的信息?我已经将我的追溯块放在这里进行审查。我为自己的愚蠢问题感到非常抱歉,但是我是Python和Scrapy的绝对入门者。
import traceback
try:
.... spider ...
except Exception as ex:
traceback.print_exception(type(ex), ex, ex.__traceback__)