我正在使用Nightmare.js构建一个屏幕刮板,并使用Vo.js来控制流量。我有一个来自CSV的一系列邮政编码,循环通过邮政编码,进行搜索,跟踪搜索结果中的每个链接,并抓取该页面。然而,它似乎只是使用第一个邮政编码进行搜索,似乎早期退出循环。有任何想法吗?这是我的代码:
var Nightmare = require('nightmare'),
vo = require('vo'),
fs = require('fs'),
parse = require('csv-parse'),
csvWriter = require('csv-write-stream'),
nightmare = Nightmare(),
writer = csvWriter(),
path = process.argv[2]
var searchByPostcode = function*(postcode) {
var result = yield nightmare
.goto(URL)
.select('#body_umbBodyContent_BranchSearch_1_ddlRadius', 20)
.type('input[id=body_umbBodyContent_BranchSearch_1_txtLocation]', postcode)
.click('#body_umbBodyContent_BranchSearch_1_btnSearch')
.wait('.resultsarea .result')
.evaluate(function () {
var agents = []
var results = $('.result a, .alternate_result a')
urls = results.map(function(r) { return BASE_URL + $(this).attr('href') })
return urls
})
return result
}
var getDetail = function*(url) {
var result = yield nightmare
.goto(url)
.wait('.wizard')
.evaluate(function() {
var company = $("div.divlabel:contains('Company:')").next('div').find('a').attr('href')
var name = $('h1.tint').text().trim()
var address = $('#address_container div:nth-child(2)').text().trim()
var website = $("div.divlabel:contains('Website:')").next('div').find('a').attr('href')
var tel = $("div.divlabel:contains('Telephone:')").next('div').text().trim()
var email = $("div.divlabel:contains('E-Mail:')").next('div').find('a').text().trim()
return {
url: document.URL,
company: company,
name: name,
address: address,
website: website,
tel: tel,
email: email
}
})
return result
}
var run = function*() {
var agents = []
var postcodes = fs.readFileSync(path, 'utf8').split('\n')
for (var i = 0, len = postcodes.length; i < len; i++) {
console.log(postcodes[i])
var urls = yield searchByPostcode(postcodes[i])
console.log(urls)
for (var i = 0, len = urls.length; i < len; i++) {
var agent = yield getDetail(urls[i])
if (agent.email == '' & agent.company != '') {
company = yield getDetail('http://www.arla.co.uk/' + agent.company)
agent.website = agent.website == undefined ? company.website : agent.website
agent.email = agent.email == '' ? company.email : agent.email
agent.tel = agent.tel == '' ? company.tel : agent.tel
}
agents.push(agent)
}
}
yield nightmare.end();
return agents
}
vo(run)()
.then(function(agents) {
writer.pipe(fs.createWriteStream('results.csv'))
for (var i = 0, len = agents.length; i < len; i++) {
writer.write(agents[i])
}
writer.end()
})
.catch(e => console.error(e))
答案 0 :(得分:0)
i
,因此i
在第一次搜索后被设置为url计数,因此一旦解析了所有URL,就退出循环。 d&#39;!哦