我正在尝试抓取一个简单的页面(使用 cheerio 和 request 这两个模块): https://www.ishares.com/uk/individual/en/products/251824/
代码运行失败。我认为原因是:要访问上述页面,用户需要先在前一个页面上选择自己是“个人”还是“机构”,因此请求被重定向了。
我尝试了不同的网址变体,但都失败了。
如何使用node.js获取原始HTML?
这是代码:
const express = require('express');
const path = require('path');
const request = require('request');
const cheerio = require('cheerio'); // fast, flexible jQuery-style API for server-side HTML
const fs = require('fs');

// NOTE(review): `path`, `app` and `port` are not used by the code shown here.
const app = express();
const port = 8000;

// Timestamps (ms since epoch) of notable events:
//   [0] script start, [1] end of the synchronous part, [2] example 1 response handled.
const timeLog = [];
timeLog[0] = Date.now();
console.log(`program initiated at time: ${new Date()}`);

// example 1: pull the webpage and print to console
// URL variants that were tried; each assignment overrides the previous one,
// so only the last value is actually requested.
let url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf";
url = "https://www.ishares.com/uk/individual/en/products/251824/";
url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";

request(url, function functionName(err, resp, body) {
  // Check for a failed request first: on error there is no usable body to parse.
  if (err) {
    console.error(err);
    return;
  }

  const $ = cheerio.load(body);
  const distYieldText = $('.col-distYield').text();
  console.log('we got to line 24');
  console.log(distYieldText);

  timeLog[2] = Date.now();
  console.log(`data capture time: ${(timeLog[2] - timeLog[0]) / 1000} seconds`);
  console.log('the body was written: success');
});

// example 2: download webpage and save file
const destination = fs.createWriteStream('./downloads/iSharesSEMB.html');
request(url)
  // Without an 'error' listener a failed request raises an unhandled 'error' event.
  .on('error', (err) => {
    console.error(err);
  })
  .pipe(destination)
  .on('error', (err) => {
    console.error(err);
  });

// example 3: same download, reporting when the file has been fully written
const destinationWithEvents = fs.createWriteStream('./downloads/iSharesSEMB2.html');
request(url)
  .on('error', (err) => {
    console.error(err);
  })
  .pipe(destinationWithEvents)
  .on('finish', () => {
    console.log('done');
  })
  .on('error', (err) => {
    console.error(err);
  });

// These lines run as soon as the requests above have been started, not when they
// finish, so the figure below only covers the synchronous part of the script.
timeLog[1] = Date.now();
console.log(`program completed at time: ${new Date()}`);
console.log(`Asynchronous program run time: ${(timeLog[1] - timeLog[0]) / 1000} seconds`);
答案 0 :(得分:0)
好的,我让它正常工作了。我为 request 启用了 Cookie 支持,但随后陷入了重定向循环。加上一个 Promise 之后问题就解决了。这里只给出相关的 HTML 请求部分:
const request = require('request');
const cheerio = require('cheerio');

const url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";

// `jar: true` switches on request's cookie support for the call below.
const options = {
  jar: true,
};

/**
 * Downloads a page and selects its `.col-distYield` element(s).
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise} Resolves with the cheerio selection for `.col-distYield`;
 *   rejects with the error reported by `request`, or with any error thrown
 *   while parsing the response body.
 */
const getDistYield = (url) =>
  new Promise((resolve, reject) => {
    request(url, options, (err, resp, body) => {
      if (err) {
        // Stop here: on failure there is no usable body to hand to cheerio.
        reject(err);
        return;
      }
      try {
        const $ = cheerio.load(body);
        resolve($('.col-distYield'));
      } catch (parseError) {
        // A throw inside this callback would otherwise leave the promise pending.
        reject(parseError);
      }
    });
  });

getDistYield(url)
  .then((tag) => {
    console.log(tag.text());
  })
  .catch((e) => {
    console.error(e);
  });
输出:
分配收益率
分配收益率表示过去12个月的分配收入与基金当前净资产值的比率 截至2018年2月20日 4.82
另请注意,我已使用您提供的最后一个网址。
我希望这能为你解决问题:)
答案 1 :(得分:0)
修改了 resolve 部分,只获取嵌套元素的值(而不是整段文本)。
// Narrow the selection to the <span> that is the second child of .col-distYield,
// then resolve with that selection.
const distYieldValue = $('.col-distYield > span:nth-child(2)');
resolve(distYieldValue);