我正在尝试从下面这个网页的每个条目中获取 `a` 标签里的文字:
https://hn.algolia.com/?query=apple&sort=byPopularity&prefix&page=0&dateRange=all&type=story
我以前解析过其他网页,但在这个页面上遇到了问题。下面是我的代码:
var cheerio = require('cheerio');
var request = require('request');
// Fetch the search page and print the link text of every result entry.
// NOTE(review): if the page fills in its results client-side, the HTML fetched
// here will contain no matching elements and nothing will be printed.
request({
  method: 'GET',
  url: 'https://hn.algolia.com/?query=apple&sort=byPopularity&prefix&page=0&dateRange=all&type=story'
}, function(err, response, body) {
  if (err) return console.error(err);
  // Tell Cheerio to load the HTML.
  // Declared with `var`: the bare assignment used before created an implicit global.
  var $ = cheerio.load(body);
  // Earlier attempt, kept for reference:
  // list = [];
  // $('div[id="item-main"]').each(function(){
  // var href = $(this).find('div > div').attr('h2');
  // list.push(h2);
  // });
  // A bare name is a tag selector; the leading dot makes it a class selector.
  // NOTE(review): presumably `item-title-and-infos` is a class name — confirm
  // against the rendered markup.
  $('.item-title-and-infos').each(function() {
    // NOTE(review): `href` normally lives on the <a>, not the <h2> — confirm
    // which element should be inspected here.
    var href = $('h2', this).attr('href');
    // .attr() yields undefined when the attribute is missing; guard so that
    // .lastIndexOf() is never called on undefined.
    if (href !== undefined && href.lastIndexOf('/') > 0) {
      console.log($('a', this).text());
    }
  });
});
感谢。
答案 0 :(得分:0)
问题是内容是异步加载的,首先加载一个几乎空的页面,然后加载你要查找的内容。
只需在请求的回调中用 console.log 打印 body,你就会看到类似下面的内容:
<!DOCTYPE html>
<html ng-app='HNSearch'>
<head ng-controller='HeadCtrl'>
...
links and meta
...
</head>
<body>
<div id='main' ng-cloak role='main' ui-view>
! NO CONTENT HERE ! IT WILL BE LOADED AFTER
</div>
<script src="https://d3nb9u6x572n0.cloudfront.net/assets/application-70dfa2f5ecb75bc8dfaa8729257bcbf1.js"></script>
</body>
</html>
如果您使用谷歌浏览器的开发者工具检查网络请求,就会看到页面加载之后调用了下面这个链接:
https://uj5wyc0l7x-dsn.algolia.net/1/indexes/Item_production/query?x-algolia-api-key=8ece23f8eb07cd25d40262a1764599b1&x-algolia-application-id=UJ5WYC0L7X&x-algolia-agent=Algolia%20for%20AngularJS%203.7.5
所以最后我找到了下面这个可能对你有帮助的解决方案:
// Query the Algolia search endpoint that the page itself calls, and print the
// raw response body.
request.post({
  url:'https://uj5wyc0l7x-dsn.algolia.net/1/indexes/Item_production/query?x-algolia-api-key=8ece23f8eb07cd25d40262a1764599b1&x-algolia-application-id=UJ5WYC0L7X&x-algolia-agent=Algolia%20for%20AngularJS%203.7.5',
  body:'{"params":"query=apple&hitsPerPage=20&minWordSizefor1Typo=5&minWordSizefor2Typos=9&advancedSyntax=true&ignorePlurals=false&tagFilters=%5B%22story%22%5D&numericFilters=%5B%5D&page=0&queryType=prefixLast&typoTolerance=true&restrictSearchableAttributes=%5B%5D"}',
  // With gzip enabled and no explicit Accept-Encoding header, `request`
  // advertises only the encodings it can decode. The header copied from the
  // browser also offered "br", which `request` is not expected to decode.
  gzip:true,
  headers:{
    // Content-Length and Host are intentionally omitted: `request` derives
    // them from the body and the URL, so they cannot go stale when the query
    // string above is edited.
    accept:'application/json',
    "Accept-Language":"es-ES,es;q=0.8",
    "Cache-Control":"no-cache",
    Connection:"keep-alive",
    "content-type":"application/x-www-form-urlencoded",
    Origin:"https://hn.algolia.com",
    Pragma:"no-cache",
    Referer:"https://hn.algolia.com/?query=apple&sort=byPopularity&prefix&page=0&dateRange=all&type=story",
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
  }
},
function (err,res,body) {
  // Report transport failures instead of logging an undefined body.
  if (err) return console.error(err);
  console.log(body);
});
现在 body 是一个包含所有数据的巨大 JSON。希望这对你有帮助。