我正在尝试创建一个web scraper(在node.js中),它将从站点中提取信息,并将其写入文件。我已经构建了正确地为一个页面工作,但是当我尝试在for循环中使用该函数时,为了遍历多个游戏,我在所有游戏中都得到了错误的数据。
我知道这与Javascript的异步性质有关,我已经阅读了有关回调函数的内容,但我不确定我是否理解如何将其应用于我的代码。任何帮助都将非常感激:
for(x = 4648; x < 4650; x++){ //iterate over a few gameIDs, used in URL for request
scrapeGame(x);
}
function scrapeGame(gameId){
//request from URL, scrape HTML to arrays as necessary
//write final array to file
}
基本上,我要做的是,在for循环中,告诉它等待完成scrapeGame(x)
函数,然后递增x并运行它以进行下一个游戏 - 否则,数组开始覆盖彼此和数据变得一团糟。
编辑:我现在已经包含了我试图运行的完整代码!我在编写文件后查看文件时遇到错误。例如,第一个文件是8kb,第二个是〜16,第三个是~32,等等。在运行下一个游戏之前似乎没有被清除。
该计划的想法是从存档网站提取Jeopardy问题/答案,以便最终为自己构建一个测验应用程序。
//Iterate over arbitrary number of games, scrape each
for(x = 4648; x < 4650; x++){
scrapeGame(x, function(scrapeResult) {
if(scrapeResult){
console.log('Scrape Successful');
} else {
console.log('Scrape ERROR');
}
});
}
function scrapeGame(gameId, callback){
var request = require('request');
cheerio = require('cheerio');
fs = require('fs');
categories = [];
categorylist = [];
ids = [];
clues = [];
values = ['0','$200','$400','$600','$800','$1000','$400','$800','$1200','$1600','$2000'];
valuelist = [];
answers = [];
array = [];
file = [];
status = false;
var showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId;
var showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;
request(showGameURL, function(err, resp, body){
if(!err && resp.statusCode === 200){
var $ = cheerio.load(body);
//add a row to categories to avoid starting at 0
categories.push('Category List');
//pull all categories to use for later
$('td.category_name').each(function(){
var category = $(this).text();
categories.push(category);
});
//pull all clue IDs (coordinates), store to 1d array
//pull any id that has "stuck" in the string, to prevent duplicates
$("[id*='stuck']").each(function(){
var id = $(this).attr('id');
id = id.toString();
id = id.substring(0, id.length - 6);
ids.push(id);
//if single J, pick category 1-6
if (id.indexOf("_J_") !== -1){
var catid = id.charAt(7);
categorylist.push(categories[catid]);
var valId = id.charAt(9);
valuelist.push(values[valId]);
}
//if double J, pick category 7-12
else if (id.indexOf("_DJ_") !== -1){
var catid = parseInt(id.charAt(8)) + 6;
categorylist.push(categories[catid]);
var valId = parseInt(id.charAt(10)) + 5;
valuelist.push(values[valId]);
}
//if final J, pick category 13
else {
categorylist.push(categories[13]);
}
});
//pull all clue texts, store to 1d array
$('td.clue_text').each(function(){
var clue = $(this).text();
clues.push(clue);
});
//push pulled values to big array
array.push(ids);
array.push(categorylist);
array.push(valuelist);
array.push(clues);
//new request to different URL to pull responses
request(showAnswerURL, function(err, resp, body){
if(!err && resp.statusCode === 200){
var $ = cheerio.load(body);
$('.correct_response').each(function(){
var answer = $(this).text();
answers.push(answer);
});
//push answers to big array
array.push(answers);
//combine arrays into 1-d array to prep for writing to file
for(var i = 0; i < array[0].length; i++){
var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
var stringPrint = print.toString();
file.push(stringPrint);
}
//update string, add newlines, etc.
var stringFile = JSON.stringify(file);
stringFile = stringFile.split('\\').join('');
stringFile = stringFile.split('","').join('\n');
//write to file, eventually will append to end of one big file
fs.writeFile('J_GAME_' + gameId +'.txt', stringFile, function(err) {
if(err) {
console.log(err);
} else {
console.log("Game #" + gameId + " has been scraped.");
status = true;
}
});
}
});
}
});
//clear arrays used
valuelist = [];
answers = [];
categories = [];
categorylist = [];
ids = [];
clues = [];
array = [];
file = [];
//feed callback status
callback(status);
}
答案 0 :(得分:3)
// Iterate over a few gameIDs, used in URL for request.
for (x = 4648; x < 4650; x++) {
// Pass in the callback as an anonymous function.
// So below I am passing in the id and the function I want to execute.
// AND, defining the results I am expecting as passed in arguments.
scrapeGame(x, function(scrapeResult, err) {
// This will *NOT* execute *UNTIL* you call it in the function below.
// That means that the for loop's execution is halted.
// This function receives the status that is passed in,
// in this case, a boolean true/false and an error if any.
if (scrapeResult) {
// Scrape was true, nothing to do.
// The for loop will now move on to the next iteration.
console.log('Scrape Successful');
} else {
// Scrape was false, output error to console.log and
// break loop to handle error.
console.log('Scrape ERROR :: ' + err);
// Notice we are calling break while in the
// scope of the callback function
// Remove the break if you want to just move onto
// the next game ID and not stop the loop
break;
}
});
}
// This function now accepts two arguments.
function scrapeGame(gameId, callback) {
// ************************************************
// ** Do Your Work Here **
// Request from URL, scrape HTML to arrays as necessary.
// Write final array to file.
// After file creation, execute the callback and pass bool
// status (true/false).
// ************************************************
var request = require('request'),
cheerio = require('cheerio'),
fs = require('fs'),
categories = [],
categorylist = [],
ids = [],
clues = [],
values = [
'0',
'$200',
'$400',
'$600',
'$800',
'$1000',
'$400',
'$800',
'$1200',
'$1600',
'$2000'
],
valuelist = [],
answers = [],
array = [],
file = [],
showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId,
showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;
request(showGameURL, function(err, resp, body) {
if (!err && resp.statusCode === 200) {
var $ = cheerio.load(body);
//add a row to categories to avoid starting at 0
categories.push('Category List');
//pull all categories to use for later
$('td.category_name').each(function() {
var category = $(this).text();
categories.push(category);
});
//pull all clue IDs (coordinates), store to 1d array
//pull any id that has "stuck" in the string, to prevent duplicates
$("[id*='stuck']").each(function() {
var id = $(this).attr('id');
id = id.toString();
id = id.substring(0, id.length - 6);
ids.push(id);
//if single J, pick category 1-6
if (id.indexOf("_J_") !== -1) {
var catid = id.charAt(7);
categorylist.push(categories[catid]);
var valId = id.charAt(9);
valuelist.push(values[valId]);
}
//if double J, pick category 7-12
else if (id.indexOf("_DJ_") !== -1) {
var catid = parseInt(id.charAt(8)) + 6;
categorylist.push(categories[catid]);
var valId = parseInt(id.charAt(10)) + 5;
valuelist.push(values[valId]);
}
//if final J, pick category 13
else {
categorylist.push(categories[13]);
}
});
//pull all clue texts, store to 1d array
$('td.clue_text').each(function() {
var clue = $(this).text();
clues.push(clue);
});
//push pulled values to big array
array.push(ids);
array.push(categorylist);
array.push(valuelist);
array.push(clues);
//new request to different URL to pull responses
request(showAnswerURL, function(err, resp, body) {
if (!err && resp.statusCode === 200) {
var $ = cheerio.load(body);
$('.correct_response').each(function() {
var answer = $(this).text();
answers.push(answer);
});
//push answers to big array
array.push(answers);
//combine arrays into 1-d array to prep for writing to file
for (var i = 0; i < array[0].length; i++) {
var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
var stringPrint = print.toString();
file.push(stringPrint);
}
//update string, add newlines, etc.
var stringFile = JSON.stringify(file);
stringFile = stringFile.split('\\').join('');
stringFile = stringFile.split('","').join('\n');
//write to file, eventually will append to end of one big file
fs.writeFile('J_GAME_' + gameId + '.txt', stringFile, function(err) {
//clear arrays used
valuelist = [];
answers = [];
categories = [];
categorylist = [];
ids = [];
clues = [];
array = [];
file = [];
if (err) {
// ******************************************
// Callback false with error.
callback(false, err);
// ******************************************
} else {
console.log("Game #" + gameId + " has been scraped.");
// ******************************************
// Callback true with no error.
callback(true);
// ******************************************
}
});
}
});
}
});
}
答案 1 :(得分:1)
我的假设是你希望它们一个接一个地被刮掉,而不是并行。所以,对于循环没有帮助。以下方法应该可以解决问题:
var x = 4648;
var myFunc = scrapeGame(x, function cb(){
if(x >= 4650){
return;
}
x++;
return myFunc(x, cb);
});
function scrapeGame(gameId){
//request from URL, scrape HTML to arrays as necessary
//write final array to file
}
对于嵌套异步函数,您希望它们以串行方式执行,您应该忘记for循环。
使用http客户端正确处理请求的示例:
function scrapeGame(gameId, cb){
//your code and set options
http.request(options, function(response){
var result = "";
response.on('data', function (chunk) {
result += chunk;
});
response.on('end',function(){
//write data here;
//do the callback
cb();
});
});
}
答案 2 :(得分:0)
我解决了我所看到的问题的ROOT原因,虽然我确实相信没有上面红色的回调协助,但我会一直迷失。
原来数据处理正确,但文件写入正在加扰。事实证明,有一种不同的方法可以调用而不是writeFile或appendFile:
fs.appendFileSync();
调用同步版本处理了写入文件的命令,这些文件已经附加到文件中,而不仅仅是为了它。除了上面的回调帮助之外,这还解决了这个问题。
感谢大家的帮助!