I am using Scrapy + Splash to have Splash click the "next page" button on the following website, https://www.daraz.pk/smartphones/, and then return the HTML of the newly rendered page.
Here is the code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from scrapy.selector import Selector


class DarazsSpider(scrapy.Spider):
    name = 'darazs'
    custom_settings = {
        'DOWNLOAD_TIMEOUT': 60000,
        'DOWNLOAD_MAXSIZE': 0,
        'DOWNLOAD_WARNSIZE': 0
    }

    # Lua script for Splash: click the "next page" button repeatedly
    # (pages 2 through 68) and collect the HTML of every rendered page.
    script = '''
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(0.5))
        treat = require('treat')
        result = {}
        for i = 2, 68, 1
        do
            assert(splash:runjs('document.querySelector(".ant-pagination-next > a").click()'))
            assert(splash:wait(5.0))
            result[i] = splash:html()
        end
        return treat.as_array(result)
    end
    '''

    def start_requests(self):
        url = 'https://www.daraz.pk/smartphones/'
        # Page 1 is rendered normally; the remaining pages should come back
        # from the Lua script above as an array of HTML strings.
        yield SplashRequest(url=url, callback=self.parse, endpoint='render.html',
                            args={'wait': 0.5})
        yield SplashRequest(url=url, callback=self.parse_other_pages, endpoint='execute',
                            args={'wait': 0.5, 'lua_source': self.script, 'timeout': 3600},
                            dont_filter=True)

    def parse(self, response):
        for phone in response.xpath('//div[@class="c2prKC"]'):
            yield {
                'Name': phone.xpath('.//div[@class="c16H9d"]/a/text()').extract(),
                'Price': phone.xpath('.//span[@class="c13VH6"]/text()').extract(),
                'old_price': phone.xpath('.//del[@class="c13VH6"]/text()').extract(),
            }

    def parse_other_pages(self, response):
        for page in response.data:
            sel = Selector(text=page, type=None, root=None)
            for phone in sel.xpath('//div[@class="c2prKC"]'):
                yield {
                    'Name': phone.xpath('.//div[@class="c16H9d"]/a/text()').extract(),
                    'Price': phone.xpath('.//span[@class="c13VH6"]/text()').extract(),
                    'old_price': phone.xpath('.//del[@class="c13VH6"]/text()').extract(),
                }
It runs the first parse callback correctly and reads the data from the first page, but it does not scrape the other pages and fails with the following error:
Traceback (most recent call last):
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy_splash/middleware.py", line 156, in process_spider_output
for el in result:
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/core/spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/ibtsam/PycharmProjects/dsf/daraz/daraz/spiders/darazs.py", line 50, in parse_other_pages
sel = Selector(text=page, type=None, root=None)
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/scrapy/selector/unified.py", line 84, in __init__
super(Selector, self).__init__(text=text, type=st, root=root, **kwargs)
File "/home/ibtsam/PycharmProjects/dsf/venv/lib/python3.6/site-packages/parsel/selector.py", line 195, in __init__
raise ValueError("Selector needs either text or root argument")
ValueError: Selector needs either text or root argument
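The failing call is the sel = Selector(text=page, ...) line, so it looks like at least one entry in response.data is None or empty. A quick way to confirm that (just a throwaway debugging version of the callback; the logging lines are my own addition and not part of the spider above) would be something like:

    def parse_other_pages(self, response):
        # Debugging sketch: log what Splash actually returned before parsing.
        self.logger.info("Splash returned %d entries", len(response.data))
        for i, page in enumerate(response.data):
            self.logger.info("entry %d: type=%s, length=%s",
                             i, type(page).__name__,
                             len(page) if page else 0)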
I think the error is somewhere in the Lua script. Any help would be appreciated.
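For what it's worth, my current guess is that because the Lua loop starts at i=2, result[1] is never set, so the array returned by treat.as_array() may come back with an empty first entry that turns into None on the Python side. The only workaround I can think of is a guard like this (just a sketch that skips empty entries rather than fixing the root cause):

    def parse_other_pages(self, response):
        for page in response.data:
            if not page:
                # Skip None/empty entries instead of passing them to Selector.
                continue
            sel = Selector(text=page)
            for phone in sel.xpath('//div[@class="c2prKC"]'):
                yield {
                    'Name': phone.xpath('.//div[@class="c16H9d"]/a/text()').extract(),
                    'Price': phone.xpath('.//span[@class="c13VH6"]/text()').extract(),
                    'old_price': phone.xpath('.//del[@class="c13VH6"]/text()').extract(),
                }

I would still like to understand what the proper fix inside the Lua script itself would be.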