我有下面的应用程序,它停止(代码如下)。我不明白为什么。我怀疑我可能错误地使用了Bottleneck模块。
免责声明:我正在尝试使用此项目学习编程和NodeJS。请帮忙。
简介
应用程序的目的是通过请求网页并将其解析为jQuery样式来获取数据库中文档中缺少的数据。然后将返回的数据保存到文档中的新键。该数据库包含约92 000份文件。该应用使用bottleneck,cheerio和request模块。我在OS X上运行应用程序。
问题
如果我将限制设置为请求数,例如
var limiter = new bottleneck(5, 0);
应用程序在第一批(本例中为5)后停止。但为什么?我怀疑Bottleneck可能有问题以及它希望我的程序如何工作。可能与每个Bottleneck "Gotchas"的回调有关吗?
如果我没有设置限制,应用程序就可以了。它获取网页并写入数据库。但是由于资源过载而导致很多错误,因此很慢。这就是我告诉瓶颈不要限制的方式:
var limiter = new bottleneck(0, 0);
这些是我得到的错误:
{ [Error: getaddrinfo ENOTFOUND www.vestnikverejnychzakazek.cz www.vestnikverejnychzakazek.cz:443]
code: 'ENOTFOUND',
errno: 'ENOTFOUND',
syscall: 'getaddrinfo',
hostname: 'www.vestnikverejnychzakazek.cz',
host: 'www.vestnikverejnychzakazek.cz',
port: 443 }
{ [Error: connect EMFILE 65.52.146.11:443 - Local (undefined:undefined)]
code: 'EMFILE',
errno: 'EMFILE',
syscall: 'connect',
address: '65.52.146.11',
port: 443 }
应用代码
'use strict';
var express = require('express');
var router = express.Router();
var assert = require('assert');
var mongo = require('mongoskin');
var path = require('path');
var ObjectID = require('mongodb').ObjectID;
var db = mongo.db("mongodb://localhost:27017/zak", {
native_parser: true
});
var database = db.collection("zakazky");
var cheerio = require("cheerio");
var request = require("request");
var fs = require("fs");
var toJs = (path.join(__dirname, '../public/javascripts', 'jquery.min.js'));
var jquery = fs.readFileSync(toJs).toString();
var bottleneck = require("bottleneck");
var limiter = new bottleneck(5, 0);
/* GET home page. */
router.get('/', function(req, res) {
var cursor = database.find();
cursor.each(function(err, data) {
assert.equal(err, null);
if (data != null) {
var vvz = "vestnikverejnychzakazek";
var praha = "zakazky.praha.eu";
var id = data["_id"];
var zdroj = data["zdroj"];
if (zdroj.indexOf(vvz) > -1) {
if ((data["cpv"] == null) || (data["predpokladana_hodnota"] == null)) {
limiter.submit(getCPV, id, zdroj, null);
// getCPV(id, zdroj);
} else {
// console.log("we're good");
return
}
} else if (zdroj.indexOf(praha) > -1) {
// console.log("pha");
}
} else {
// callback();
}
});
var getCPV = function(id, zdroj, callback) {
console.log("CPV started");
var zdroj = zdroj.replace("http://", "https://");
console.log("zdroj: " + zdroj);
var cpv = [];
var retryWrapper = function(retries) {
var retries; // I added this
if (retries === 3) {
return;
} else if (retries === undefined) {
retries = 0;
} else if (retries > 0) {
console.log("trying again");
}
request(zdroj, function(err, resp, data) {
if (err) {
console.log(err);
return retryWrapper(retries + 1);
}
var $ = cheerio.load(data);
var predpokladnaHodnota = $("[id*='Hodnota1_']").first().attr("value");
$("[id*='HlavniSlovnik']").each(function() {
cpv.push(this.attribs.value);
});
// let's check what we've got is actual data
if (cpv.length === 0) {
return
} else {
// send it off
writeCPV(id, "cpv", cpv)
}
if (predpokladnaHodnota == undefined || predpokladnaHodnota == null) {
return
} else {
// send it off
writeCPV(id, "predpokladana_hodnota", predpokladnaHodnota)
}
callback();
});
}; // end of retryWrapper
retryWrapper();
};
var writeCPV = function(id, key, value) {
id = ObjectID(id);
(function() {
console.log("starting DB write 1");
database.update({
"_id": id
}, {
$set: {
[key]: value
}
}, function(err, results) {
if (err) {
console.log("error in Mongo DB: \n------------------------\n" + err);
}
console.log("Mongo success!:\n ----------------------\n" + results);
// callback();
});
})();
};
// send the browser we're done
res.sendStatus(200);
});
// ---------------------
module.exports = router;
以下是来自数据库的示例文档,包括获取的密钥:
{
"_id": ObjectId("568d91396912101c1007ab4e"),
"cena": 1636363,
"cena_celkem": 1500000,
"cena_dopocitano": false,
"created": "2015-04-07T13:45:10.420739",
"datum_zadani": "2015-02-16",
"dodavatel": "/api/v1/dodavatel/381836/",
"druh_rizeni": "/api/v1/druh_rizeni/1116/",
"id": 1312587,
"modified": "2015-04-18T14:22:10.765733",
"nazev": "Pohostinství",
"pocet_nabidek": 2,
"podporeno_eu": true,
"popis": "Kurzy v oblasti pohostinství (formou profesní kvalifikace)",
"ramcova_smlouva": true,
"resource_uri": "/api/v1/zakazka/1312587/",
"skupina": "490648-ISVZUS_2011",
"typ_zakazky": "/api/v1/typ_zakazky/193/",
"zadavatel": "/api/v1/zadavatel/131528/",
"zdroj": "http://www.vestnikverejnychzakazek.cz/en/Form/Display/568547",
"zdroj_nazev": "isvzus.cz",
"cpv": ["80000000-4", "80400000-8", "", "", ""],
"predpokladana_hodnota": "1 500 000,00"
}
请求的示例网址:
http://www.vestnikverejnychzakazek.cz/en/Form/Display/568547
答案 0 :(得分:0)
这已经在这里了一段时间,但万一其他人偶然发现这有希望这将有助于某人!
限制器调用getCPV并且它在序列结束时调用回调,但是,retryWrapper中有一些条件语句允许提前返回,因此永远不会调用回调。限制器将被激活,直到它触发为止,所以始终确保在所有场景中都会触发回调。