I am trying to extract all the links from a website. My spider is a subclass of a superclass called GeneralSpider. The problem: when I rename the method parse_url to parse (overriding the superclass's method), the link extractor grabs all the links on the main page but does not follow them. If I leave the method name as parse_url, the spider does not work at all. Am I doing something wrong?
# -*- coding: utf-8 -*-
from core.generalSpider import GeneralSpider
from scrapy.linkextractors import LinkExtractor
from scrapy import log
from scrapy.contrib.spiders import Rule
from scrapy.item import Item, Field

from spiders.settings import GET_ITEMS


class MyItem(Item):
    url = Field()
    text = Field()
    item = Field()


class GetItemsSpider(GeneralSpider):
    name = GET_ITEMS
    start_urls = 'http://www.example.com'
    allowed_domains = ['example.com']
    rules = (Rule(LinkExtractor(allow=()), callback='parse_url', follow=True), )

    def __init__(self, port, **kwargs):
        super(GetItemsSpider, self).__init__(port, **kwargs)
        # User agent (Utils and core_settings are project helpers; their
        # imports are omitted in this snippet)
        self.user_agent = Utils.get_random_item_from_list(core_settings.USER_AGENT_LIST)
        # Scrapy logs
        self.log('GetItemsSpider init start_urls= %s parameters= %s ' %
                 (self.start_urls, str(self.parameters)), level=log.DEBUG)
        self.log('%s init start_urls= %s parameters= %s ' %
                 (self.name, self.start_urls, str(self.parameters)), level=log.INFO)
        self.log('USER AGENT = %s' % self.user_agent, level=log.INFO)
        self.log('PORT = %s' % self._proxy_port, level=log.INFO)

    def parse_url(self, response):
        items = []
        self.log('GetItemsSpider parse start %s' % response.url, level=log.DEBUG)
        for link in LinkExtractor().extract_links(response):
            item = MyItem()
            item['text'] = link.text
            item['url'] = link.url
            items.append(item)
        return items
Answer 0 (score: 1)
There is no better explanation than the one in the documentation; check the warning here: do not override parse. CrawlSpider uses the parse method itself to implement its crawling logic, so if you override it, the rules are never applied and links stop being followed.
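To make the warning concrete, here is a minimal sketch of the pattern the docs recommend (the class name, spider name, and parse_item callback are invented for this example, and the modern scrapy.spiders import path is assumed): the rule callback gets any name except parse, and parse itself is never touched.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DocsPatternSpider(CrawlSpider):
    name = 'docs_pattern'
    start_urls = ['http://www.example.com']  # a list, not a bare string
    allowed_domains = ['example.com']

    # CrawlSpider implements parse() itself to apply these rules,
    # so the callback may be named anything but 'parse'.
    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)

    def parse_item(self, response):
        # Called for every page the rule follows.
        yield {'url': response.url}

Laid out this way the spider both yields items and keeps following links, which is exactly the behaviour that disappears when parse is overridden.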
Answer 1 (score: 0)
In the end I could not find out why my code was not working, but I found another solution:
def parse_url(self, response):
    # Requires, in addition to the imports above:
    #   import urlparse
    #   from scrapy.http import Request
    self.log('GetItemsSpider parse start %s' % response.url, level=log.DEBUG)
    for link in LinkExtractor().extract_links(response):
        item = MyItem()
        item['text'] = link.text
        item['url'] = link.url
        if condition:  # placeholder for whatever test decides which links to follow
            yield Request(urlparse.urljoin(response.url, link.url), callback=self.parse)
        yield item
This solution is based on Philip Adzanoukpe's example. I hope this helps.
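For comparison, the same manual link-following idea works on a plain scrapy.Spider, where overriding parse is safe because there is no rule machinery to clobber. A minimal sketch (class and spider names are invented):

import scrapy
from scrapy.linkextractors import LinkExtractor

class ManualFollowSpider(scrapy.Spider):
    name = 'manual_follow'
    start_urls = ['http://www.example.com']
    allowed_domains = ['example.com']

    def parse(self, response):
        # A plain Spider has no rule machinery, so defining parse is the
        # normal pattern here.
        for link in LinkExtractor().extract_links(response):
            yield {'text': link.text, 'url': link.url}
            # link.url is already absolute, so no urljoin is needed.
            yield scrapy.Request(link.url, callback=self.parse)

The scheduler's duplicate filter stops the recursion from revisiting pages, and allowed_domains keeps the crawl on-site.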