我正在编写一个内容抓取工具,用于删除特定网站上的衬衫信息。我已经在Node中使用NPM包设置了所有内容来抓取并创建CSV文件。我遇到的问题是,众所周知,Node本质上是异步的。我试图编写的CSV文件是在我创建的JSON对象完成创建之前编写的(使用每个循环进行迭代来构建它),因此它传入json2csv(npm包)的'fields'参数。但它将我的数据作为空对象传递。任何人都可以告诉我如何告诉节点等到我的json对象生成之后再尝试使用fs.writefile创建CSV文件?谢谢
'use strict';
//require NPM packages
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var json2csv = require('json2csv');
//Array for shirts JSON object for json2csv to write.
var ShirtProps = [];
var homeURL = "http://www.shirts4mike.com/";
//start the scraper
scraper();
//Initial scrape of the shirts link from the home page
function scraper () {
//use the datafolderexists function to check if data is a directory
if (!DataFolderExists('data')) {
fs.mkdir('data');
}
//initial request of the home url + the shirts.php link
request(homeURL + "shirts.php", function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//scrape each of the links for its html data
$('ul.products li').each(function(i, element){
var ShirtURL = $(this).find('a').attr('href');
console.log(ShirtURL);
//pass in each shirtURL data to be scraped to add it to an object
ShirtHTMLScraper(ShirtURL);
});
FileWrite();
// end first request
} else {
console.error(error);
}
});
}
//create function to write the CSV file.
function FileWrite() {
var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Time'];
var csv = json2csv({data: ShirtProps, fields: fields});
console.log(csv);
var d = new Date();
var month = d.getMonth()+1;
var day = d.getDate();
var output = d.getFullYear() + '-' +
((''+month).length<2 ? '0' : '') + month + '-' +
((''+day).length<2 ? '0' : '') + day;
fs.writeFile('./data/' + output + '.csv', csv, function (error) {
if (error) throw error;
});
}
//function to scrape each of the shirt links and create a shirtdata object for each.
function ShirtHTMLScraper(ShirtURL) {
request(homeURL + ShirtURL, function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var time = new Date().toJSON().substring(0,19).replace('T',' ');
//json array for json2csv
var ShirtData = {
title: $('title').html(),
price: $(".price").html(),
imgURL: $('img').attr('src'),
url: homeURL + ShirtURL,
time: time.toString()
};
//push the shirt data scraped into the shirtprops array
ShirtProps.push(ShirtData);
console.log(ShirtProps);
// //set the feilds in order for the CSV file
// var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Time'];
// //use json2csv to write the file -
// var csv = json2csv({data: ShirtProps, fields: fields});
// console.log(csv);
// //date for the filesystem to save the scrape with today's date.
// var d = new Date();
// var month = d.getMonth()+1;
// var day = d.getDate();
// var output = d.getFullYear() + '-' +
// ((''+month).length<2 ? '0' : '') + month + '-' +
// ((''+day).length<2 ? '0' : '') + day;
// //use filesystem to write the file, or overrite if it exists.
// fs.writeFile('./data/' + output + '.csv', csv, function (error) {
// if (error) throw error;
// }); //end writeFile
} else {
console.error(error);
}
});
}
//Check if data folder exists, source: http://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
function DataFolderExists(folder) {
try {
// Query the entry
var DataFolder = fs.lstatSync(folder);
// Is it a directory?
if (DataFolder.isDirectory()) {
return true;
} else {
return false;
}
} //end try
catch (error) {
console.error(error);
}
}
答案 0 :(得分:3)
并不是说节点本质上是异步的,因为某些函数是异步的。在这种情况下,它是使用请求的异步调用。你在第二个请求调用(ShirtHTMLScraper内部的一个)开始后直接调用FileWrite。在填充ShirtProps之后,将调用FileWrite放在ShirtHTMLScraper的回调中。
编辑:仔细观察后,这也无济于事。问题是你在同步循环中调用异步函数。您可以通过创建一个计数器来实现这一点,该计数器在每个异步回调时递增,并检查您是否达到了要迭代的项目的长度。如果您正在进行最后一次迭代,请运行FileWrite。
更好的方法可能是查看Async库。你可以使用.each()来提供两个回调,一个在每次迭代时运行,一个在它们全部完成时运行。