如何在bash中从HTML文档中提取特定文本?

时间:2018-02-05 11:30:51

标签: html bash

我正在尝试从http://www.hotstar.com/sports/cricket/series/m184177/match-clips/greater-things-lie-ahead-dravid/2002003068中提取播放器正下方显示的文字。

我想要提取的文字是 "Greater Things Lie Ahead: Dravid"。

我尝试使用curl但我在curl给我的输出中找不到它们。我该怎么做?

编辑:分析页面源代码后发现,我想要提取的文本在源码中只以占位符 {{pageTitle}} 的形式出现,例如 <meta property="og:title" content="{{pageTitle}}">。有什么方法可以提取 {{pageTitle}} 的值吗?

3 个答案:

答案 0 :(得分:1)

// CasperJS script: opens the match-clip page, waits for the heading under
// `.title-holder` to be rendered, and prints its text.
var casper = require('casper').create();

/**
 * Returns the text of the <h1> inside `.title-holder`.
 * Executed in the page context through `casper.evaluate`, so `$` here is
 * whatever the loaded page exposes under that name (presumably jQuery).
 */
function getText() {
    return $('.title-holder').find('h1').text();
}

// Give the page's client-side rendering up to 60 s to produce the heading.
casper.options.waitTimeout = 60000;

// start() has to come before any step is queued: CasperJS rejects step
// methods such as waitForSelector() when the instance has not been started.
casper.start('http://www.hotstar.com/sports/cricket/series/m184177/match-clips/greater-things-lie-ahead-dravid/2002003068');
casper.waitForSelector('.title-holder h1');

casper.then(function () {
    console.log(this.evaluate(getText));
});

casper.run();

答案 1 :(得分:1)

不使用 casper 的 PhantomJS 变体。

// PhantomJS script: loads the match-clip page and prints the text of the
// <h1> inside `.title-holder`. Run with: phantomjs request.js
var url = 'http://www.hotstar.com/sports/cricket/series/m184177/match-clips/greater-things-lie-ahead-dravid/2002003068';
var page = require('webpage').create();

page.open(url, function (status) {
  // Do not evaluate against a page that failed to load; report it and
  // signal the failure through the exit code instead.
  if (status !== 'success') {
    console.error('Failed to load ' + url + ' (status: ' + status + ')');
    phantom.exit(1);
    // phantom.exit() does not stop the current callback, so return explicitly.
    return;
  }

  // Runs in the page context; `$` is whatever the page exposes under that
  // name (presumably jQuery).
  var string = page.evaluate(function () {
    return $('.title-holder').find('h1').text();
  });
  console.log(string);
  phantom.exit();
});

用法:phantomjs request.js

输出:

Greater Things Lie Ahead: Dravid

答案 2 :(得分:0)

您可以尝试无头 Chrome 以获得更稳定的结果。我连续运行了 10 次,10 次都成功获取到了结果。

设定:
安装Chrome

  

# Initialise the project non-interactively, then install the two packages the script below requires.
npm init --yes
  npm install --save chrome-remote-interface
  npm install --save chrome-launcher
  
  然后将以下代码保存到foo.js,并运行:

     

node foo.js "http://www.hotstar.com/sports/cricket/series/m184177/match-clips/greater-things-lie-ahead-dravid/2002003068"

// Usage: node foo.js <url>
// Launches headless Chrome, loads <url>, and prints the text content of the
// element matching `.title-holder h1`.

// Named `targetUrl` rather than `URL` so the global URL constructor is not
// overwritten by an undeclared assignment.
const targetUrl = process.argv[2];
const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');
const file = require('fs');

(async function() {
    if (!targetUrl) {
        throw new Error('Usage: node foo.js <url>');
    }

    const chrome = await chromeLauncher.launch({
        chromeFlags: [
            '--disable-gpu',
            '--headless'
        ]
    });

    let protocol;
    try {
        protocol = await CDP({
            port: chrome.port
        });

        const {
            DOM,
            Network,
            Page,
            Runtime
        } = protocol;
        await Promise.all([Network.enable(), Page.enable(), Runtime.enable(), DOM.enable()]);
        await Network.setUserAgentOverride({ userAgent });

        //uncomment the following line for network traffic checking
        //await Network.requestWillBeSent((params) => {console.log(params.request.url);});

        // Subscribe to the load event BEFORE navigating so it cannot fire
        // while no listener is attached. Called without a callback, the
        // event method returns a promise for the next occurrence.
        const pageLoaded = Page.loadEventFired();
        await Page.navigate({
            url: targetUrl
        });
        await pageLoaded;

        const js = "document.querySelector('.title-holder h1').textContent";
        const result = await Runtime.evaluate({ expression: js });

        // A page-side exception (e.g. the selector matched nothing) is
        // reported in exceptionDetails rather than as a rejected promise.
        if (result.exceptionDetails) {
            throw new Error('Evaluation failed: ' + result.exceptionDetails.text);
        }

        console.log(result.result.value);
    } finally {
        // Always release the debugging connection and the browser process,
        // including when any step above throws.
        if (protocol) {
            await protocol.close();
        }
        await chrome.kill();
    }
})().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});