我是第一次使用 Node.js,正在做一些网页抓取,用来检查某个页面是否存在(是否返回 404),或者页面中是否包含某个 div。
在独立版本中它工作得很完美(即一次只处理一个网址),但现在我想从 Google 电子表格中读取数据,循环遍历每一行,并对每个网站发出请求,然后用 fs.appendFile 把结果写入一个文件。(在另一个版本中,我会更新 Google 文档中的单元格。)
以下是googlemaps.json中的输出(运行脚本后创建的文件):
{
"client": "Not working client map (404 test)",
"url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
"verified": "Not verified",
"status": "MAP DELETED (404) !"
}{
"client": "Not working client map(404 test)",
"url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
"verified": "NOT verified",
"status": "Somethings wrong, please verify."
}{
"client": "Not working client map(404 test)",
"url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
"verified": "Verified local business",
"status": "Map is verified !"
}{
"client": "Not working client map(404 test)",
"url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
"verified": "Verified local business",
"status": "Map is verified !"
}
所以我有:
一个电子表格,包含 2 列、4 行。
两列分别是客户名称和对应 Google+ 页面的网址。
这里都是非常基本的东西。我不明白的是:为什么 "verified" 和 "status" 看起来是正确的,而 "client" 名称和 URL 却全都一样?它好像一直停在第 4 行(我的电子表格一共有 4 行)……但我可以确认各条记录的状态是正确的。
然而,当我在我发出请求的行之前运行一个console.log时,一切似乎都很好。
console.log(key + " -> " + rows[key][1] + " / " + rows[key][2]);
这一行对全部 4 行都正确地输出了客户名称/网址,没有任何问题。
然后就在这一行之后:
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var json = { client : "", url : "", verified : "", status: ""};
一切都停留在第4行......
以下是其余的代码:
// Express app exposing GET /maps. The handler loads the rows of a Google
// spreadsheet, requests the URL found in each row, classifies the returned
// page with cheerio, and appends one JSON record per request to
// googleMaps.json.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var Spreadsheet = require('edit-google-spreadsheet');
app.get('/maps', function(req, res){
Spreadsheet.load({
debug: true,
/*spreadsheetName: 'Google Maps Url List',
worksheetName: 'Sheet1',*/
spreadsheetId: 'MY ID',
worksheetId: 'od6',
// 1. Username and Password
username: 'USERNAME',
password: 'PASS',
}, function sheetReady(err, spreadsheet) {
// A load failure is rethrown from inside this callback.
if(err) throw err;
spreadsheet.receive(function(err, rows, info) {
if(err) throw err;
console.log(rows);
//console.log(rows["1"]["2"]);
// rows is iterated by key; each row is read at index 1 (client name) and
// index 2 (URL).
for (var key in rows) {
if (rows.hasOwnProperty(key)) {
//key++;
console.log(key + " -> " + rows[key][1] + " / " + rows[key][2])
// NOTE(review): `var` is function-scoped, so every request callback created
// in this loop shares these single `url` / `clientName` bindings. A callback
// that runs after the loop has advanced reads the values of the last row,
// not the row it was started for.
var url = rows[key][2];
var clientName = rows[key][1];
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
// NOTE(review): json is only assigned inside this branch; when `error` is
// set it is still undefined at the appendFile call below.
var json = { client : "", url : "", verified : "", status: ""};
// Case 1: the verified-badge element exists; its text is stored as `verified`.
if ($('.xR.OA.Hlb.d-k-l.NA').length){
// filter() is used here only to visit each matched element; if several
// match, the assignments from the last one win.
$('.xR.OA.Hlb.d-k-l.NA').filter(function(){
var data = $(this);
var isValid = data.text();
json.client = clientName;
json.url = url;
json.verified = isValid;
json.status = "Map is verified !";
})
} else {
// Case 2: no badge, but the Google 404 error container is present.
if ($('#af-error-container').length){
json.client = clientName;
json.url = url;
json.verified = "Not verified";
json.status = "MAP DELETED (404) !";
} else { // Case 3: neither element found - the page exists but is not verified anymore
json.client = clientName;
json.url = url;
json.verified = "NOT verified";
json.status = "Somethings wrong, please verify.";
}
} //endif
} //end of if error
// Records are appended back to back with no separator, so the file is a
// sequence of JSON objects rather than one JSON document.
// NOTE(review): `err` is never checked; the success message is logged even
// if the write failed.
fs.appendFile('googleMaps.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the googleMaps.json file');
})
})
} //endif rowhasproperty
} //end for loop
// NOTE(review): sent as soon as the loop has issued the requests; nothing
// here waits for the request callbacks or the file writes to finish.
res.send("check the generated file");
}); //end of spreadsheet receive
}); //end of sheetReady
})
app.listen('8081')
console.log('Google Magic happens on port 8081');
exports = module.exports = app;
希望有人能看明白我的问题。我想知道这是不是作用域/全局变量的问题……我已经反复尝试了好几个小时,仍然毫无头绪。
这是电子表格。
答案 0 :(得分:0)
你遇到的是闭包(closure)问题。
你的 request() 调用是异步的:发起之后,需要一段时间才会返回结果。但 Node 不会阻塞等待它返回,而是会直接继续执行循环的下一次迭代。
当循环继续时,会更改url
和clientName
的值:
var url = rows[key][2];
var clientName = rows[key][1];
在将来的某个时刻,request() 会完成并调用它的回调。此时回调读取的是 url 和 clientName 在那一刻的值,而不是发起请求时的值。因此,匿名函数的每个实例都会使用循环最后一次迭代留下的值来执行。
有关详细信息,请查看this SO answer。
最简单的解决方案之一是将您需要的任何变量传递给为每次循环迭代重新定义的新闭包。
// Wrap the request in an immediately-invoked function: `url` and `clientName`
// become parameters of that function, so each loop iteration gets its own
// copies and the request callback keeps the values it was started with.
(function (url, clientName) {
  request(url, function (error, response, html) {
    // ... same response handling as in the question's callback ...
  });
})(url, clientName);
现在,url 和 clientName 是每次循环迭代各自的局部变量,会一直保持各自正确的值。
答案 1 :(得分:0)
啊,JS 作用域的乐趣。
请记住,用 var 声明的变量的作用域是函数级别的。因此,要让每个请求处理函数都拿到正确的值,应该把它用到的变量声明在该处理函数独有的作用域里。我建议把请求部分抽取成一个函数,并把 url 和 clientName 作为参数传入:
/**
 * Requests one Google+ page and appends a JSON record describing its
 * verification state to googleMaps.json.
 *
 * `url` and `clientName` are parameters, so every call keeps its own values
 * for the callbacks below, however many calls are in flight at once.
 *
 * A record is written on every path, including a failed request, so each
 * call leaves one entry in the output file.
 *
 * @param {string} url - Address handed to request().
 * @param {string} clientName - Client label copied into the record.
 * @returns {undefined} Results are delivered only through the file append
 *   and console output.
 */
function checkUrl(url, clientName) {
  request(url, function (error, response, html) {
    // Created before the error check so the append below always has a
    // well-formed record to serialize.
    var json = { client: clientName, url: url, verified: "", status: "" };

    if (error) {
      var reason = error.message || String(error);
      console.error('Request failed for ' + url + ': ' + reason);
      json.verified = "Unknown";
      json.status = "Request failed: " + reason;
    } else {
      var $ = cheerio.load(html);
      var badge = $('.xR.OA.Hlb.d-k-l.NA');

      if (badge.length) {
        // Verified local business: keep the badge text. When several
        // elements match, the last one is used.
        json.verified = badge.last().text();
        json.status = "Map is verified !";
      } else if ($('#af-error-container').length) {
        // No badge, but the Google 404 error container is present.
        json.verified = "Not verified";
        json.status = "MAP DELETED (404) !";
      } else {
        // Page exists but carries no verified badge.
        json.verified = "NOT verified";
        json.status = "Somethings wrong, please verify.";
      }
    }

    fs.appendFile('googleMaps.json', JSON.stringify(json, null, 4), function (err) {
      if (err) {
        // Report the failure instead of claiming success.
        console.error('Could not write googleMaps.json: ' + (err.message || err));
        return;
      }
      console.log('File successfully written! - Check your project directory for the googleMaps.json file');
    });
  });
}
然后,在当前代码中原本给 url 和 clientName 赋值的位置调用这个函数:
// Goes inside the for-in loop, replacing the `var url` / `var clientName`
// assignments: index 2 of the row is the URL, index 1 the client name.
checkUrl(rows[key][2], rows[key][1]);