我正在尝试从http://www.hotstar.com/sports/cricket/series/m184177/match-clips/greater-things-lie-ahead-dravid/2002003068中提取播放器正下方显示的文字。
我想要提取的文字是 "Greater Things Lie Ahead: Dravid"。
我尝试使用 curl,但在 curl 返回的输出中找不到这段文字。我该怎么做?
编辑:分析页面源代码后发现,我想要提取的文本在源码中只是模板占位符 {{pageTitle}}(例如 <meta property="og:title" content="{{pageTitle}}">),页面运行后才会被替换为实际标题。有什么方法可以提取 {{pageTitle}} 的值吗?
答案 0 :(得分:1)
// CasperJS script: opens the match-clip page, waits for the title heading to
// appear in the DOM, then prints its text.
// ES5 syntax (`var`, function expressions) is kept on purpose for the
// PhantomJS engine CasperJS runs on.
var casper = require('casper').create();

// Give wait* steps up to 60000 ms before they time out.
casper.options.waitTimeout = 60000;

/**
 * Returns the text of the <h1> inside `.title-holder`.
 * Passed to `casper.evaluate`, so it executes in the page context.
 * Assumes the page itself exposes jQuery as `$` (it is not defined here).
 */
function getText() {
    return $('.title-holder').find('h1').text();
}

// start() has to come before any step is queued: CasperJS refuses to queue
// waitForSelector()/then() while it has not been started.
casper.start('http://www.hotstar.com/sports/cricket/series/m184177/match-clips/greater-things-lie-ahead-dravid/2002003068');
casper.waitForSelector('.title-holder h1');
casper.then(function () {
    console.log(this.evaluate(getText));
});
casper.run();
答案 1 :(得分:1)
不使用 CasperJS、只用 PhantomJS 的变体。
// PhantomJS script: loads the page and prints the text of the title heading.
// ES5 syntax is kept on purpose for the PhantomJS engine.
var url = 'http://www.hotstar.com/sports/cricket/series/m184177/match-clips/greater-things-lie-ahead-dravid/2002003068';
var page = require('webpage').create();

page.open(url, function (status) {
    // Stop on a failed load instead of evaluating against an empty page
    // and exiting with a success code.
    if (status !== 'success') {
        console.error('Failed to load ' + url + ' (status: ' + status + ')');
        phantom.exit(1);
        return;
    }

    // Runs in the page context; assumes the page exposes jQuery as `$`.
    var title = page.evaluate(function () {
        return $('.title-holder').find('h1').text();
    });
    console.log(title);
    phantom.exit();
});
用法:phantomjs request.js
输出:
Greater Things Lie Ahead: Dravid
答案 2 :(得分:0)
您可以尝试使用无头(headless)Chrome 以获得更稳定的结果。我连续运行了 10 次,10 次都成功取得了结果。
设定:
安装Chrome
npm init --yes
npm install --save chrome-remote-interface
npm install --save chrome-launcher
然后将以下代码保存为 foo.js,并运行:node foo "http://www.hotstar.com/sports/cricket/series/m184177/match-clips/greater-things-lie-ahead-dravid/2002003068"
// Usage: node foo <url>
// Launches headless Chrome, loads <url>, and prints the text content of the
// first element matching `.title-holder h1`.
const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');
const file = require('fs'); // NOTE(review): not referenced anywhere below.

const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36';

// Declared with const and named targetUrl so that the built-in global `URL`
// class is not overwritten by an implicit global assignment.
const targetUrl = process.argv[2];

/**
 * Drives one headless Chrome session: navigate, wait for the load event,
 * read the title from the DOM, print it, and always release Chrome.
 *
 * @returns {Promise<void>} Resolves after the browser has been shut down.
 * @throws {Error} If navigation fails or the in-page expression throws
 *   (for example because the element is missing).
 */
async function main() {
  if (!targetUrl) {
    console.error('Usage: node foo <url>');
    process.exitCode = 1;
    return;
  }

  const chrome = await chromeLauncher.launch({
    chromeFlags: ['--disable-gpu', '--headless'],
  });

  let protocol;
  try {
    protocol = await CDP({ port: chrome.port });
    const { DOM, Network, Page, Runtime } = protocol;

    await Promise.all([Network.enable(), Page.enable(), Runtime.enable(), DOM.enable()]);
    await Network.setUserAgentOverride({ userAgent });

    // Uncomment the following line to log every request URL for network debugging:
    // Network.requestWillBeSent((params) => { console.log(params.request.url); });

    // Subscribe to the load event BEFORE navigating so the event cannot fire
    // in the gap between navigate() resolving and the listener being attached.
    const pageLoaded = Page.loadEventFired();
    const navigation = await Page.navigate({ url: targetUrl });
    if (navigation && navigation.errorText) {
      throw new Error(`Navigation to ${targetUrl} failed: ${navigation.errorText}`);
    }
    await pageLoaded;

    const expression = "document.querySelector('.title-holder h1').textContent";
    const evaluation = await Runtime.evaluate({ expression });
    // A throw inside the page (e.g. querySelector returned null) is reported
    // here rather than as a rejected promise; surface it instead of printing
    // `undefined`.
    if (evaluation.exceptionDetails) {
      throw new Error(`Could not read the title: ${evaluation.exceptionDetails.text}`);
    }
    console.log(evaluation.result.value);
  } finally {
    // Release the debugging connection and the browser on every exit path.
    try {
      if (protocol) {
        await protocol.close();
      }
    } finally {
      await chrome.kill();
    }
  }
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});