我正在尝试使用以下代码抓取网站:
const cheerio = require('cheerio');
const jsonframe = require('jsonframe-cheerio');
// Question's original script: tries to scrape the coin table with a jsonframe frame.
// NOTE(review): cheerio.load() is given the URL text itself, not the page's HTML;
// nothing in this script downloads the page, so there is no table markup to query.
const $ = cheerio.load('https://coinmarketcap.com/all/views/all/');
jsonframe($); // initializes the plugin (the .scrape() call below relies on it)
// Exception handling: log uncaught exceptions and unhandled promise rejections.
process.on('uncaughtException', err =>
console.error('uncaught exception: ', err))
process.on('unhandledRejection', (reason, p) =>
console.error('unhandled rejection: ', reason, p))
// Frame describing what to extract: one "crypto" entry per "tbody > tr" row,
// with name, link (href attribute), market cap and price taken from its cells.
// NOTE(review): presumably the array form of "data" asks for a list of rows — confirm
// against the jsonframe-cheerio documentation.
const frame = {
"crypto": {
"selector": "tbody > tr",
"data": [{
"name": "td:nth-child(2) > a:nth-child(3)",
"url": {
"selector": "td:nth-child(2) > a:nth-child(3)",
"attr": "href"
},
"marketcap": "tr > td:nth-child(4)",
"price": "tr > td:nth-child(5) > a:nth-child(1)",
}]
}
};
// Apply the frame to the <tbody> selection and print whatever comes back.
let companiesList = $('tbody').scrape(frame);
console.log(companiesList);
但是,在运行上面的示例代码时,我得到了 UnhandledPromiseRejectionWarning:
(node:3890) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 1): TypeError: selector.includes is not a function
我哪里做错了?有什么建议吗?
感谢您的回复!
更新
我将代码更改为以下内容。但是,我只能抓取到第一个元素。
其他元素为什么没有被抓取?有什么建议吗?
const cheerio = require('cheerio')
const jsonframe = require('jsonframe-cheerio')
const got = require('got');
/**
 * Download https://coinmarketcap.com/all/views/all/, load the response body
 * into cheerio and log the result of scraping <body> with `frame`
 * (using the `string: true` option).
 *
 * NOTE(review): `frame` is a flat frame applied once to <body>; per the
 * surrounding discussion this yields only the first matching element.
 *
 * @returns {Promise<void>} Settles after the scrape result has been logged;
 *   rejects if the request or the scraping step throws.
 */
async function scrapCoinmarketCap() {
  const url = 'https://coinmarketcap.com/all/views/all/';
  const html = await got(url);
  const $ = cheerio.load(html.body);
  jsonframe($); // initializing the plugin
  const frame = {
    "Coin": "td.no-wrap.currency-name > a",
    "url": "td.no-wrap.currency-name > a @ href",
    "Symbol": "td.text-left.col-symbol",
    "Price": "td:nth-child(5) > a",
  };
  console.log($('body').scrape(frame, {
    string: true,
  }));
}

// Handle a failed run explicitly instead of leaving the promise floating:
// report the error and mark the process as failed.
scrapCoinmarketCap().catch((err) => {
  console.error('scrapCoinmarketCap failed: ', err);
  process.exitCode = 1;
});
答案 0 :(得分:5)
根据您更新后的代码,您可以通过迭代每个 tr 来抓取所有货币数据:
// Scrape each table row on its own and print that row's result.
$('body tr').each((index, row) => {
  const rowResult = $(row).scrape(frame, { string: true });
  console.log(rowResult);
});
但是,我认为最简洁的方法(正如我在另一个回答中所述)是使用 jsonframe-cheerio 的 List/Array 帧模式,它正是为此而设计的:
// List-mode frame: `_s` names the repeating element and `_d` (wrapped in an
// array) describes the fields to collect for every match, not just the first.
let frame = {
  currency: {
    _s: 'tr',
    _d: [
      {
        Coin: 'td.no-wrap.currency-name > a',
        Url: 'td.no-wrap.currency-name > a @ href',
        Symbol: 'td.text-left.col-symbol',
        Price: 'td:nth-child(5) > a',
      },
    ],
  },
};

// Run the frame against <body> and print the scraped output.
console.log($('body').scrape(frame, { string: true }));
答案 1 :(得分:-2)
方法 cheerio.load() 不接受网址 - 它需要以字符串形式传入的 HTML。
虽然我没有查看cheerio的源代码,但似乎该模块试图将URL解析为HTML文档,显然会失败并且会出现各种错误。
要解决此问题,您需要先将该网址的HTML内容加载到变量中,然后将该HTML内容传递给cheerio。
以下是使用 got 加载页面的示例:
const got = require('got')
const cheerio = require('cheerio')
// Download the page first, then hand the response body (the HTML text) to cheerio.
got('https://google.com')
  .then(({ body }) => {
    const $ = cheerio.load(body);
    // Continue as usual
  })
  .catch((err) => console.error(err));