def _has_class(name):
    """Return an XPath predicate that matches *name* as one class token.

    Comparing ``@class`` to an exact string breaks as soon as the attribute
    carries extra whitespace or additional classes. Normalizing the
    attribute and padding it with spaces lets us test for a whole token.
    """
    return 'contains(concat(" ", normalize-space(@class), " "), " %s ")' % name


class DmozSpider(scrapy.Spider):
    """Spider that collects title, link and description of DMOZ listings."""

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    # XPath expressions are built once; class tests are whitespace-tolerant.
    _ITEMS_XPATH = '//div[@id="site-list-content"]/div[%s]' % _has_class("site-item")
    _BLOCK_XPATH = "div[%s]" % _has_class("title-and-desc")
    _TITLE_XPATH = "%s//div[%s]/text()" % (_BLOCK_XPATH, _has_class("site-title"))
    _LINK_XPATH = "%s//a/@href" % _BLOCK_XPATH
    _DESC_XPATH = "normalize-space(%s//div[%s]/text())" % (
        _BLOCK_XPATH,
        _has_class("site-descr"),
    )

    def parse(self, response):
        """Yield one ``DmozItem`` per listing entry found in *response*.

        Each field is stored as the list returned by ``extract()``, so
        consumers receive lists of strings rather than bare strings.
        """
        for sel in response.xpath(self._ITEMS_XPATH):
            item = DmozItem()
            item['title'] = sel.xpath(self._TITLE_XPATH).extract()
            item['link'] = sel.xpath(self._LINK_XPATH).extract()
            # normalize-space() collapses the whitespace around the description.
            item['desc'] = sel.xpath(self._DESC_XPATH).extract()
            yield item
这是我的爬虫(spider)代码。我是按照 Scrapy 官网上的教程来写的,但教程中的代码有些过时,所以我不得不自己修改代码。这段代码对 /Python/Books/ 页面有效,但对 /Python/Resources/ 页面无效。有人能解释一下为什么会出现这种情况吗?谢谢。