I am using Scrapy to crawl a website recursively, but the problem is that the spider never enters the parse_item method. My spider file is named example.py. The code is below:
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy.utils.response import get_base_url
class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = [
        "http://www.dmoz.org/Arts/"
    ]
    print start_urls
    rules = (
        Rule(SgmlLinkExtractor(allow=('/Arts', )), callback='parse_item', follow=True),
    )
    # The spider is not entering this parse_item method
    def parse_item(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title
Answer 0 (score: 0)
Why are you trying to define and call the function explicitly? Try this:
class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = ["http://www.dmoz.org/Arts/"]

    def parse(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title
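To try either version, assuming example.py sits in the spiders/ directory of a standard Scrapy project, you can run the spider by its name from the project root:

scrapy crawl example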