您好,我已经安装了 ScrapyJS + Splash,并使用以下代码:
import json
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spider import Spider
from scrapy.selector import Selector
import urlparse, random
class DmozSpider(scrapy.Spider):
    """Collect match-report links from a WhoScored fixtures page rendered by Splash.

    Each start URL is requested through Splash's ``render.html`` endpoint and
    every match-report link found in the rendered page is appended, as an
    absolute URL, to ``links2.txt``.
    """

    name = "dmoz"
    allowed_domains = ["whoscored.com"]
    start_urls = ['http://www.whoscored.com/Regions/81/Tournaments/3/Seasons/4336/Stages/9192/Fixtures/Germany-Bundesliga-2014-2015']

    def start_requests(self):
        """Yield one Splash-rendered request per start URL."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    # Ask Splash to wait 0.5 s before returning the page.
                    'args': {'wait': 0.5}
                }
            })

    def parse(self, response):
        """Append the absolute URL of every match-report link to ``links2.txt``.

        The XPath query is evaluated once and its results are iterated
        directly.  Errors are no longer swallowed: a failing selector or a
        failing write propagates instead of being mistaken for "no more
        links".
        """
        # NOTE(review): ``response.body`` is a byte string; this matches the
        # Python 2 imports used by this file -- confirm before porting.
        hrefs = Selector(text=response.body).xpath(
            '//a[@class="match-link match-report rc"]/@href'
        ).extract()
        with open('links2.txt', 'a') as f:
            for href in hrefs:
                f.write("https://www.whoscored.com" + href + '\n')
到目前为止它工作正常,但现在我想点击日期控制器(date-controller)中的“上一个”(previous)按钮,它既没有 id,也没有真正的 href。
我试过了
splash:runjs("$('#date-controller').click()")
和
splash:runjs("window.location = document.getElementsByTagName('a')[64].href")
但都没有成功。
答案 0 :(得分:8)
以下是使用 Splash 的 `/execute` 端点(endpoint)的示例:
# -*- coding: utf-8 -*-
import json
from six.moves.urllib.parse import urljoin
import scrapy
class WhoscoredspiderSpider(scrapy.Spider):
    """Log WhoScored fixtures after clicking the date controller via Splash.

    Each start URL is sent to Splash's ``execute`` endpoint together with a
    Lua script that loads the page, clicks the first link inside
    ``#date-controller`` and returns the resulting HTML.  The returned HTML is
    then walked date by date and every fixture row (plus its match-report
    URL, when present) is written to the spider log.
    """

    name = "whoscoredspider"
    allowed_domains = ["whoscored.com"]
    start_urls = (
        'http://www.whoscored.com/Regions/81/Tournaments/3/Seasons/4336/Stages/9192/Fixtures/Germany-Bundesliga-2014-2015',
    )

    def start_requests(self):
        """Yield one Splash ``execute`` request per start URL."""
        # Lua source executed by Splash; kept verbatim because it is sent
        # over the wire as-is.
        lua_source = """
function main(splash)
local url = splash.args.url
assert(splash:go(url))
assert(splash:wait(1))
-- go back 1 month in time and wait a little (1 second)
assert(splash:runjs("$('#date-controller > a:first-child').click()"))
assert(splash:wait(1))
-- return result as a JSON object
return {
html = splash:html(),
-- we don't need screenshot or network activity
--png = splash:png(),
--har = splash:har(),
}
end
"""
        for start_url in self.start_urls:
            # A fresh meta dict is built for every request so that requests
            # never share the same ``splash`` options object.
            splash_options = {
                'args': {'lua_source': lua_source},
                'endpoint': 'execute',
            }
            yield scrapy.Request(
                start_url,
                self.parse_result,
                meta={'splash': splash_options},
            )

    def parse_result(self, response):
        """Log every date header and fixture row found in the Splash result.

        ``response`` carries the JSON object returned by the Lua script; its
        ``"html"`` key holds the rendered page.
        """
        # The response URL is the Splash endpoint, so the page URL is read
        # back from the Splash arguments stored in the request meta.
        page_url = response.meta["splash"]["args"]["url"]

        payload = json.loads(response.body_as_unicode())
        rendered = scrapy.Selector(text=payload["html"], type="html")

        # Rows belonging to a date header are the non-header sibling rows
        # that are preceded by exactly as many header rows as seen so far.
        rows_after_header = (
            '\n./following-sibling::tr[not(th/@colspan)]'
            '\n[count(preceding-sibling::tr[th/@colspan])=%d]'
        )

        for fixtures in rendered.css('table#tournament-fixture'):
            date_headers = fixtures.css('tr.rowgroupheader')
            for headers_seen, date_header in enumerate(date_headers, start=1):
                date_text = date_header.xpath('string()').extract_first()
                self.logger.info("date: %s" % date_text)
                for fixture_row in date_header.xpath(rows_after_header % headers_seen):
                    self._log_fixture_row(fixture_row, page_url)

    def _log_fixture_row(self, fixture_row, page_url):
        """Log one fixture row and, if it has one, its match-report URL."""
        row_text = fixture_row.xpath('string()').extract_first()
        self.logger.info("record: %s" % row_text)

        report_href = fixture_row.css('td > a.match-report::attr(href)').extract_first()
        if not report_href:
            return
        report_url = urljoin(page_url, report_href)
        self.logger.info("match report: %s" % report_url)
示例日志:
$ scrapy crawl whoscoredspider
2016-03-07 19:21:38 [scrapy] INFO: Scrapy 1.0.5 started (bot: whoscored)
(...stripped...)
2016-03-07 19:21:38 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, SplashMiddleware, ChunkedTransferMiddleware, DownloaderStats
2016-03-07 19:21:38 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2016-03-07 19:21:38 [scrapy] INFO: Enabled item pipelines:
2016-03-07 19:21:38 [scrapy] INFO: Spider opened
2016-03-07 19:21:38 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-03-07 19:21:43 [scrapy] DEBUG: Crawled (200) <POST http://localhost:8050/execute> (referer: None)
2016-03-07 19:21:43 [whoscoredspider] INFO: date: Saturday, Apr 4 2015
2016-03-07 19:21:43 [whoscoredspider] INFO: record: 14:30FTWerder Bremen0 : 0Mainz 05Match Report2
2016-03-07 19:21:43 [whoscoredspider] INFO: match report: http://www.whoscored.com/Matches/834843/MatchReport
2016-03-07 19:21:43 [whoscoredspider] INFO: record: 14:30FTEintracht Frankfurt2 : 2Hannover 96Match Report1
2016-03-07 19:21:43 [whoscoredspider] INFO: match report: http://www.whoscored.com/Matches/834847/MatchReport
(...stripped...)
2016-03-07 19:21:43 [whoscoredspider] INFO: date: Sunday, Apr 26 2015
2016-03-07 19:21:43 [whoscoredspider] INFO: record: 14:30FT1Paderborn2 : 2Werder BremenMatch Report2
2016-03-07 19:21:43 [whoscoredspider] INFO: match report: http://www.whoscored.com/Matches/834837/MatchReport
2016-03-07 19:21:43 [whoscoredspider] INFO: record: 16:30FTBorussia M.Gladbach1 : 0WolfsburgMatch Report12
2016-03-07 19:21:43 [whoscoredspider] INFO: match report: http://www.whoscored.com/Matches/834809/MatchReport
2016-03-07 19:21:43 [scrapy] INFO: Closing spider (finished)
2016-03-07 19:21:43 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1015,
'downloader/request_count': 1,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 143049,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 3, 7, 18, 21, 43, 662973),
'log_count/DEBUG': 2,
'log_count/INFO': 90,
'log_count/WARNING': 3,
'response_received_count': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'splash/execute/request_count': 1,
'splash/execute/response_count/200': 1,
'start_time': datetime.datetime(2016, 3, 7, 18, 21, 38, 772848)}
2016-03-07 19:21:43 [scrapy] INFO: Spider closed (finished)