NodeJS x-ray 无法爬取多个页面来抓取数据

时间:2016-07-08 07:01:45

标签: javascript node.js web-scraping x-ray

我需要把房地产广告抓取到 nidax.json 文件中。我先访问广告列表页面,再通过其中指向各个广告的链接去获取所需的数据。我使用的是 NodeJS 的 x-ray 抓取库,但由于某种原因它无法正常工作。

有时候它什么都不返回,有时它只返回指向各个广告的链接。

// Scrape the ad listing page: for every "read more" span, record the link to
// the individual ad and collect details from the linked ad page, then write
// the result to nidax.json.
var Xray = require('x-ray');
var x = Xray();

x('http://nidax-nekretnine.rs/nekretnine/', 'div.kutija-veca_dno > div.read-more > span', [{
  url: 'a@href',
  // NOTE(review): this nested selector repeats the full path from the page
  // root even though the enclosing scope is already the span; presumably it
  // should be relative to that scope (e.g. 'a@href', like `url` above) --
  // confirm against the x-ray documentation.
  items: x('div.kutija-veca_dno > div.read-more > span > a@href', {
    location: 'body > div.contentarea-novo > div > div.info-part > div.one-third  div.osnovni-podaci > p:nth-child(2) > span.orange-text'
  }) // follow the link to the individual ad page
}]).write('nidax.json');

1 个答案:

答案 0 :(得分:0)

您可以订阅下面这个 pull request,以便在它被批准(合并)时收到通知。

在此期间,我建议您直接在已下载的 x-ray 模块中应用该修复。改动只有一行代码,我已在两个项目中测试过,确实有效。请查看 index.js 文件的第 237 行附近,注意那段长注释之后的 `return` 语句:

/**
 * Build a function that walks `selector` against a loaded document and
 * reports the collected result through a Node-style callback.
 *
 * @param {Function} xray - factory used to build a nested node for each
 *   matched element when the selector holds an array of objects.
 * @param {*} selector - value handed to `walk`; the visitor below handles
 *   string, function and array entries.
 * @param {*} scope - scope used when resolving values and when locating the
 *   elements to iterate over.
 * @param {*} filters - forwarded unchanged to `resolve`.
 * @returns {Function} `walkHTML($, fn)`; `fn` is called as `fn(err)` on
 *   failure or `fn(null, obj, $)` on success.
 */
function WalkHTML (xray, selector, scope, filters) {
  return function walkHTML ($, fn) {
    // `walk` is defined elsewhere; presumably it calls the visitor for each
    // entry of `selector` and the second callback once with the final result.
    walk(selector, function (v, k, next) {
      if (typeof v === 'string') {
        // String entry: resolve it directly against the document.
        var value = resolve($, root(scope), v, filters)
        return next(null, value)
      } else if (typeof v === 'function') {
        // Function entry: let it produce the value and forward its outcome.
        return v($, function (err, obj) {
          if (err) return next(err)
          return next(null, obj)
        })
      } else if (isArray(v)) {
        if (typeof v[0] === 'string') {
          // Array whose first element is a string: resolve the whole array.
          return next(null, resolve($, root(scope), v, filters))
        } else if (typeof v[0] === 'object') {
          // Array whose first element is an object: run a nested node for
          // every element matched by `scope` and gather the results.
          var $scope = $.find ? $.find(scope) : $(scope)
          var pending = $scope.length
          var out = []

          // Handle the empty result set (thanks @jenbennings!)
          if (!pending) return next(null, out)

          $scope.each(function (i, el) {
            var $innerscope = $scope.eq(i)
            var node = xray(scope, v[0])
            node($innerscope, function (err, obj) {
              if (err) return next(err)
              // Store by index so the output order follows the element order.
              out[i] = obj
              // Only the last nested callback to finish reports the result.
              if (!--pending) {
                return next(null, compact(out))
              }
            })
          })
          // Nested crawling is broken on 'master' (pending merge of
          // 'bugfix/nested-crawling', #111). We must exit here WITHOUT calling
          // next: otherwise control falls through to `return next()` below and
          // reaches the "finished" callback before all pending requests have
          // been retrieved. Completion is signalled only by
          // `return next(null, compact(out))` above.
          return
        }
      }
      // No branch above handled this entry: continue without a value.
      return next()
    }, function (err, obj) {
      if (err) return fn(err)
      fn(null, obj, $)
    })
  }
}