我正在尝试编写一个网络爬虫:它遍历州列表,进入每个州的页面,再遍历该州的各个县,并从每个县页面的 HTML 表格中提取一个值。我想把该值与其他信息一起存入数组,并将这个数组返回给调用模块。
我用以下代码解析县列表:
const express = require('express');
const fs = require('fs');
const request = require('request');
const cheerio = require('cheerio');
const app = express();
var counties = require('./livingWageCounties');
exports.processStateHTML = async function (inState, url){
var resultsToDisplay = [];
var newUrl;
var results = {};
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
//var results ={};
$('.counties').children('div').children('li').children('a').each(function(i, elm) {
var data = $(this);
var county = data.text().trim();
var newUrl = 'http://livingwage.mit.edu' + data.attr("href");
results = JSON.parse(counties.processCountyHTML(inState, county, newUrl));
console.log("results are: " + results);
//resultsToDisplay.push(results);
});
}
})
return resultsToDisplay;
};
上面的模块会调用下面这个模块(livingWageCounties)中的 processCountyHTML 函数,以获取我需要的值:
const express = require('express');
const fs = require('fs');
const request = require('request');
const cheerio = require('cheerio');
const app = express();
exports.processCountyHTML = function (instate, incounty, url){
var state, county, livingWageAmt;
var countyInfo = { state : "", county : "", livingWageAmt : ""};
var myJSON;
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var row = $('.expenses_table').find("tr[class='odd results']").find("td:nth-child(8)").text().trim();
countyInfo.state = instate;
countyInfo.county = incounty;
countyInfo.livingWageAmt = row;
return countyInfo;
};
});
};
问题是:当我尝试获取 processCountyHTML 的结果时,得到的是 undefined。我猜这可能是一个异步问题,但我原以为 request 中的回调函数已经解决了这一点。还是说,我在异步/回调(这对我来说还是新知识)方面遗漏了什么重要的东西?