我想搭建一个简单示例,让 Scrapy 按优先级顺序处理请求。我为每个请求生成一个随机优先级,并通过 priority 参数附加到请求上。第二个回调函数 get_link_title 会打印名称,用来检查优先级顺序是否被遵守——结果并没有被遵守。我想不出自己哪里做错了:
from tutorial.items import TutorialItem
from scrapy.http import Request
import random
class DmozSpider(BaseSpider):
    """Scrape a DMOZ directory page and follow every listed site.

    ``parse`` builds one ``TutorialItem`` per directory entry and schedules a
    request for the entry's URL with a random ``priority``;
    ``get_link_title`` then stores the ``<title>`` text of the linked page on
    the item carried in ``response.meta``.

    NOTE(review): ``BaseSpider`` and ``HtmlXPathSelector`` are not imported in
    the visible part of this file -- confirm they are imported elsewhere.
    NOTE(review): request priority presumably only orders requests that are
    still waiting in the scheduler; with concurrent downloads the callbacks
    may still fire in a different order -- confirm against the Scrapy
    settings (e.g. CONCURRENT_REQUESTS).
    """

    name = "dmoz"
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
    ]

    @staticmethod
    def _first(values, default=None):
        """Return the first element of *values*, or *default* if it is empty."""
        return values[0] if values else default

    def parse(self, response):
        """Build an item per directory entry and request its target page.

        Args:
            response: the downloaded directory page.

        Returns:
            A list of ``Request`` objects, one per entry that has a link.
            Each request carries its item in ``meta['item']``, uses
            ``get_link_title`` as callback and has a random priority in
            the inclusive range 0..100.
        """
        tree = HtmlXPathSelector(response)
        reqs = []
        for site in tree.xpath('//ul[@class="directory-url"]/li'):
            url = self._first(site.xpath('a/@href').extract())
            if url is None:
                # An entry without a link cannot be followed; skip it rather
                # than abort the whole callback with an IndexError.
                continue

            item = TutorialItem()
            item['name'] = self._first(site.xpath('a/text()').extract(), u'')
            item['url'] = url
            # Description is the text after "- " up to the end of the line.
            item['description'] = self._first(
                site.xpath('./text()').re(r'-\s([^\n]*?)\n'), u'')

            pty = random.randint(0, 100)
            # Single pre-formatted argument: same output on Python 2 and 3.
            print('From first page %s %s' % (item['name'], pty))
            reqs.append(Request(url, meta={'item': item},
                                callback=self.get_link_title, priority=pty))
        return reqs

    def get_link_title(self, response):
        """Attach the linked page's title to the item and return it.

        Args:
            response: the downloaded page of a directory entry; must carry
                the item created in ``parse`` under ``meta['item']``.

        Returns:
            The item with ``title_link`` set to the text of the first
            ``<title>`` element, or an empty string when the page has none.
        """
        item = response.meta['item']
        print('Title for %s' % (item['name'],))
        tree = HtmlXPathSelector(response)
        item['title_link'] = self._first(
            tree.xpath('//title/text()').extract(), u'')
        return item