我相信这是一个错误。 我正在尝试使用request和cheerio编写一个简单的网络抓取工具。
我如何解决它:
更新:经一些人指出,问题在于所需的DOM节点是在cheerio解析并遍历页面之后才被创建的。因此,在cheerio拿到的HTML中,我要查找的那部分页面内容根本不存在。有什么办法可以绕过这个问题吗?
我使用的版本:
{
"name": "discont",
"version": "1.0.0",
"description": "Find when the item is on sale",
"main": "index.js",
"license": "MIT",
"devDependencies": {
"express": "^4.16.4"
},
"dependencies": {
"cheerio": "^1.0.0-rc.2",
"ejs": "^2.6.1",
"request": "^2.88.0"
}
}
这是我要抓取的HTML:
这是我的代码:
// Request the page and print the Cheerio selection for the direct child
// <div> elements of the #product-price container.
request(url, options, function(error, response, html) {
  // Report a failed request instead of silently printing nothing.
  if (error) {
    console.error("request failed:", error);
    return;
  }
  const $ = cheerio.load(html, { withDomLvl1: false });
  // Earlier attempt, kept for reference:
  // console.log("product-price", $("div.product-price")[0].attribs);
  console.log("product-price", $("div#product-price > div"));
});
console.log输出的是一个空集合(length为0,找不到嵌套的div)。
这是我得到的输出:
initialize {
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] },
length: 0,
prevObject:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] } }
但是如果我将代码更改为
// Request the page and print the Cheerio selection for the #product-price
// container element itself.
request(url, options, function(error, response, html) {
  // Report a failed request instead of silently printing nothing.
  if (error) {
    console.error("request failed:", error);
    return;
  }
  const $ = cheerio.load(html, { withDomLvl1: false });
  // Earlier attempt, kept for reference:
  // console.log("product-price", $("div.product-price")[0].attribs);
  console.log("product-price", $("div#product-price"));
});
我得到一个包含单个元素的数组:
initialize {
'0':
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs:
{ class: 'product-price',
id: 'product-price',
'data-bind': 'component: { name: "product-price", params: {state: state, showGermanVatMessage: false }}' },
'x-attribsNamespace': { class: undefined, id: undefined, 'data-bind': undefined },
'x-attribsPrefix': { class: undefined, id: undefined, 'data-bind': undefined },
children: [],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object],
'x-attribsNamespace': [Object],
'x-attribsPrefix': [Object],
children: [Array],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Object],
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: [Object] } },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] },
length: 1,
prevObject:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] } }
但是,我看不到该元素的子元素(children数组为空),并且在该对象上调用find()或text()等方法也得不到任何结果。
欢迎任何帮助!
答案 0 :(得分:2)
Cheerio只能访问初始HTML对应的DOM,也就是XHR等后续操作发生之前的状态;它不会执行页面中的JavaScript。对于经JavaScript渲染之后的DOM,您需要使用puppeteer或nightmarejs这类浏览器自动化工具。