Nightmare.js和Vo.js - 早退出循环

时间:2016-10-13 09:19:50

标签: javascript nightmare

我正在使用Nightmare.js构建一个屏幕刮板,并使用Vo.js来控制流量。我有一个来自CSV的一系列邮政编码,循环通过邮政编码,进行搜索,跟踪搜索结果中的每个链接,并抓取该页面。然而,它似乎只是使用第一个邮政编码进行搜索,似乎早期退出循环。有任何想法吗?这是我的代码:

var Nightmare = require('nightmare'),
    vo = require('vo'),
    fs = require('fs'),
    parse = require('csv-parse'),
    csvWriter = require('csv-write-stream'),
    nightmare = Nightmare(),
    writer = csvWriter(),
    path = process.argv[2]

var searchByPostcode = function*(postcode) {
  var result = yield nightmare
    .goto(URL)
    .select('#body_umbBodyContent_BranchSearch_1_ddlRadius', 20)
    .type('input[id=body_umbBodyContent_BranchSearch_1_txtLocation]', postcode)
    .click('#body_umbBodyContent_BranchSearch_1_btnSearch')
    .wait('.resultsarea .result')
    .evaluate(function () {
      var agents = []
      var results = $('.result a, .alternate_result a')
      urls = results.map(function(r) { return BASE_URL + $(this).attr('href') })
      return urls
    })

  return result
}

var getDetail = function*(url) {
  var result = yield nightmare
    .goto(url)
    .wait('.wizard')
    .evaluate(function() {
      var company = $("div.divlabel:contains('Company:')").next('div').find('a').attr('href')
      var name = $('h1.tint').text().trim()
      var address = $('#address_container div:nth-child(2)').text().trim()
      var website = $("div.divlabel:contains('Website:')").next('div').find('a').attr('href')
      var tel = $("div.divlabel:contains('Telephone:')").next('div').text().trim()
      var email = $("div.divlabel:contains('E-Mail:')").next('div').find('a').text().trim()

      return {
        url: document.URL,
        company: company,
        name: name,
        address: address,
        website: website,
        tel: tel,
        email: email
      }
    })

  return result
}

var run = function*() {
  var agents = []
  var postcodes = fs.readFileSync(path, 'utf8').split('\n')

  for (var i = 0, len = postcodes.length; i < len; i++) {
    console.log(postcodes[i])
    var urls = yield searchByPostcode(postcodes[i])
    console.log(urls)

    for (var i = 0, len = urls.length; i < len; i++) {
      var agent = yield getDetail(urls[i])
      if (agent.email == '' & agent.company != '') {
        company = yield getDetail('http://www.arla.co.uk/' + agent.company)
        agent.website = agent.website == undefined ? company.website : agent.website
        agent.email = agent.email == '' ? company.email : agent.email
        agent.tel = agent.tel == '' ? company.tel : agent.tel
      }
      agents.push(agent)
    }

  }

  yield nightmare.end();

  return agents
}

vo(run)()
  .then(function(agents) {
    writer.pipe(fs.createWriteStream('results.csv'))
    for (var i = 0, len = agents.length; i < len; i++) {
      writer.write(agents[i])
    }
    writer.end()
  })
  .catch(e => console.error(e))

1 个答案:

答案 0 :(得分:0)

是的,事实证明我是个白痴。我在我的循环中将两个索引计数器都设置为i,因此i在第一次搜索后被设置为url计数,因此一旦解析了所有URL,就退出循环。 d&#39;!哦