使用Cheerio选择器无法检索到子元素

时间:2018-11-24 11:10:31

标签: node.js web-scraping cheerio

我相信这是一个错误。 我正在尝试使用request和cheerio编写一个简单的网络抓取工具。

我尝试过的解决方法:

  1. 是的,我尝试过用其他方式来定义选择器。
  2. 是的,我已经研究了其他stackoverflow问题。
  3. 是的,我在cheerio github上创建了一个问题,这里是链接:https://github.com/cheeriojs/cheerio/issues/1252
  4. 是的,我是一名专业的Web开发人员,这不是我第一次使用node.js。

更新: 经一些人指出,问题在于我需要的DOM节点是在cheerio解析并遍历页面之后才被创建的。 因此,在我请求到的页面中,这部分内容根本不存在。 有什么办法可以绕过这个问题吗?

我使用的版本:

{
  "name": "discont",
  "version": "1.0.0",
  "description": "Find when the item is on sale",
  "main": "index.js",
  "license": "MIT",
  "devDependencies": {
    "express": "^4.16.4"
  },
  "dependencies": {
    "cheerio": "^1.0.0-rc.2",
    "ejs": "^2.6.1",
    "request": "^2.88.0"
  }
}

这是我要抓取的HTML:

enter image description here

链接在这里: https://www.asos.com/new-look-wide-fit/new-look-wide-fit-court-shoe/prd/10675413?clr=oatmeal&SearchQuery=&cid=6461&gridcolumn=1&gridrow=9&gridsize=4&pge=1&pgesize=72&totalstyles=826

这是我的代码:

request(url, options, function(error, response, html) {
    if (!error) {
      var $ = cheerio.load(html, { withDomLvl1: false });
      // console.log("product-price", $("div.product-price")[0].attribs);
      console.log("product-price", $("div#product-price > div"));
    }
  });

console.log返回一个空数组(无法找到嵌套的div)。

这是我得到的输出:

initialize {
  options: 
   { withDomLvl1: false,
     normalizeWhitespace: false,
     xml: false,
     decodeEntities: true },
  _root: 
   initialize {
     '0': 
      { type: 'root',
        name: 'root',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: {},
        'x-attribsNamespace': {},
        'x-attribsPrefix': {},
        children: [Array],
        parent: null,
        prev: null,
        next: null },
     options: 
      { withDomLvl1: false,
        normalizeWhitespace: false,
        xml: false,
        decodeEntities: true },
     length: 1,
     _root: [Circular] },
  length: 0,
  prevObject: 
   initialize {
     '0': 
      { type: 'root',
        name: 'root',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: {},
        'x-attribsNamespace': {},
        'x-attribsPrefix': {},
        children: [Array],
        parent: null,
        prev: null,
        next: null },
     options: 
      { withDomLvl1: false,
        normalizeWhitespace: false,
        xml: false,
        decodeEntities: true },
     length: 1,
     _root: [Circular] } }

但是如果我将代码更改为

request(url, options, function(error, response, html) {
    if (!error) {
      var $ = cheerio.load(html, { withDomLvl1: false });
      // console.log("product-price", $("div.product-price")[0].attribs);
      console.log("product-price", $("div#product-price"));
    }
  });

我得到一个包含单个元素的数组:

initialize {
  '0': 
   { type: 'tag',
     name: 'div',
     namespace: 'http://www.w3.org/1999/xhtml',
     attribs: 
      { class: 'product-price',
        id: 'product-price',
        'data-bind': 'component: { name: "product-price", params: {state: state, showGermanVatMessage: false }}' },
     'x-attribsNamespace': { class: undefined, id: undefined, 'data-bind': undefined },
     'x-attribsPrefix': { class: undefined, id: undefined, 'data-bind': undefined },
     children: [],
     parent: 
      { type: 'tag',
        name: 'div',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: [Object],
        'x-attribsNamespace': [Object],
        'x-attribsPrefix': [Object],
        children: [Array],
        parent: [Object],
        prev: [Object],
        next: [Object] },
     prev: 
      { type: 'text',
        data: '\n    ',
        parent: [Object],
        prev: [Object],
        next: [Circular] },
     next: 
      { type: 'text',
        data: '\n    ',
        parent: [Object],
        prev: [Circular],
        next: [Object] } },
  options: 
   { withDomLvl1: false,
     normalizeWhitespace: false,
     xml: false,
     decodeEntities: true },
  _root: 
   initialize {
     '0': 
      { type: 'root',
        name: 'root',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: {},
        'x-attribsNamespace': {},
        'x-attribsPrefix': {},
        children: [Array],
        parent: null,
        prev: null,
        next: null },
     options: 
      { withDomLvl1: false,
        normalizeWhitespace: false,
        xml: false,
        decodeEntities: true },
     length: 1,
     _root: [Circular] },
  length: 1,
  prevObject: 
   initialize {
     '0': 
      { type: 'root',
        name: 'root',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: {},
        'x-attribsNamespace': {},
        'x-attribsPrefix': {},
        children: [Array],
        parent: null,
        prev: null,
        next: null },
     options: 
      { withDomLvl1: false,
        normalizeWhitespace: false,
        xml: false,
        decodeEntities: true },
     length: 1,
     _root: [Circular] } }

但是,我看不到该元素的子元素(children数组为空),也无法在该对象上执行任何方法,例如find()或text()。

欢迎任何帮助!

1 个答案:

答案 0 :(得分:2)

Cheerio只能访问在XHR之类的请求发生之前的DOM(即服务器返回的初始HTML)。要获取经JavaScript渲染之后的DOM,您需要使用puppeteer或nightmarejs。