我正在尝试重新编写使用requests-html库创建的代码。由于该项目需要额外的功能,因此我现在使用scrapy。
我无法让 scrapy/splash 爬虫通过 XPath 取到数据。每次运行代码,都得不到任何结果。
而使用 requests-html 时,同样的 XPath 可以返回所需的数据。
requests-html代码:
from requests_html import AsyncHTMLSession, HTMLSession
asession = AsyncHTMLSession()
async def get_page():
    """Fetch the TradingView symbol page for NASDAQ-MDB and render it.

    The page is requested through the module-level ``asession`` and
    ``arender(wait=4)`` is awaited on its HTML before the response is
    returned to the caller.
    """
    code = 'NASDAQ-MDB'
    page_url = f'https://www.tradingview.com/symbols/{code}/'
    response = await asession.get(page_url)
    # The render step must finish before the caller queries the HTML.
    await response.html.arender(wait=4)
    return response
# Run the coroutine on the async session and report two fundamentals
# for every response it returns.
results = asession.run(get_page)
for result in results:
    # Both figures sit in spans sharing one class attribute, so each is
    # addressed by its position among the matches (2nd and 4th).
    enterprise_value_sel = "(//span[@class='tv-widget-fundamentals__value apply-overflow-tooltip'])[2]"
    total_shares_outstanding_sel = "(//span[@class='tv-widget-fundamentals__value apply-overflow-tooltip'])[4]"
    page = result.html
    enterprise_value = page.xpath(enterprise_value_sel, first=True).text
    total_shares_outstanding = page.xpath(total_shares_outstanding_sel, first=True).text
    print(enterprise_value, total_shares_outstanding)
scrapy-splash 代码:
import scrapy
from scrapy_splash import SplashRequest
import json
from tradingview.items import *
import datetime
import os
class TradingviewsigsSpider(scrapy.Spider):
    """Spider that loads a TradingView symbol page through Splash.

    Each start URL is sent to Splash's ``execute`` endpoint together with
    the Lua script below; ``parse`` then reads two values out of the
    response with positional XPath expressions and yields them as a
    ``TradingviewItem``.
    """

    name = 'tradingviewsigs'

    # Lua script executed by Splash: open the URL, wait 5.5 seconds,
    # scroll the window to (0, 800), then return the page HTML together
    # with a PNG screenshot and the HAR log.
    script = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(5.5))
local scroll_to =splash:jsfunc("window.scrollTo")
scroll_to(0, 800)
return {
html =splash:html(),
png =splash:png(),
har =splash:har(),
}
end
"""

    start_urls = ['https://tradingview.com/symbols/NASDAQ-MDB/']

    def start_requests(self):
        """Yield one ``SplashRequest`` per start URL, handled by ``parse``."""
        for start_url in self.start_urls:
            yield SplashRequest(
                url=start_url,
                callback=self.parse,
                endpoint='execute',
                args={'lua_source': self.script},
            )

    def parse(self, response):
        """Print the crawled URL and yield an item with the two values."""
        print('Crawling: < {} >'.format(response.url))

        # Item field name -> XPath for the text of the span at that position.
        field_queries = (
            (
                'enterprise_val_sel',
                "(//span[@class='tv-widget-fundamentals__value apply-overflow-tooltip'])[2]/text()",
            ),
            (
                'total_shares_outstanding_sel',
                "(//span[@class='tv-widget-fundamentals__value apply-overflow-tooltip'])[4]/text()",
            ),
        )

        item = TradingviewItem()
        for field_name, query in field_queries:
            item[field_name] = response.xpath(query).extract_first()
        yield item
我该怎么做才能使xpath与scrapy兼容?
答案 0 :(得分:1)
您需要相应地修正 XPath(用 `()` 将表达式括起来,再用 `[position]` 按位置选取所需的元素):
(//span[@class='tv-widget-fundamentals__value apply-overflow-tooltip'])[2]
(//span[@class='tv-widget-fundamentals__value apply-overflow-tooltip'])[4]
输出:9.334B-57.566M