scrapy不按优先级顺序处理请求

时间:2014-02-26 14:06:21

标签: python scrapy

我想搭建一个简单的示例,让请求按优先级顺序被处理。我为每个请求生成一个随机优先级,并将其附加到请求上。第二个函数 get_link_title 会输出名称,用来检查是否遵守了优先级顺序。结果并没有遵守。我想不出自己哪里做错了:

from tutorial.items import TutorialItem
from scrapy.http import Request
import random

class DmozSpider(BaseSpider):
    name = "dmoz"
    start_urls = [
            "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/"
            ]

    def parse(self, response):
        #filename = response.url.split('/')[-2]
        #open(filename, 'wb').write(response.body)
        tree = HtmlXPathSelector(response)
        sites = tree.xpath('//ul[@class="directory-url"]/li')
        items = []
        reqs = []

        for site in sites:
            item = TutorialItem()
            item['name'] = site.xpath('a/text()').extract()[0]
            item['url'] = site.xpath('a/@href').extract()[0]
            item['description'] = site.xpath('./text()').re('-\s([^\n]*?)\\n')[0]
            items.append(item)
            pty = random.randint(0, 100)
            print 'From first page', item['name'], pty
            reqs.append(Request(item['url'], meta={'item':item}, callback=self.get_link_title, priority=pty))
        return reqs

    def get_link_title(self, response):
        item = response.meta['item']
        print 'Title for', item['name']
        tree = HtmlXPathSelector(response)
        item['title_link'] = tree.xpath('//title/text()').extract()[0]
        return item

0 个答案:

没有答案