我正在创建一个网络抓取工具,用于抓取该网站(https://www.imdb.com/movies-coming-soon/)明年发行的所有电影,并循环浏览包含下一个月每个月所有电影的一系列链接。一年,它的正常运行,但唯一的问题是由于node.js的异步行为,它没有按顺序返回它们,我如何使其遍历数组并按顺序返回数据?
我试图做一个回调函数,但我不知道它会去哪里
const request = require('request')
const cheerio = require('cheerio')
const movieArray = [ '/movies-coming-soon/2019-09/',
'/movies-coming-soon/2019-10/',
'/movies-coming-soon/2019-11/',
'/movies-coming-soon/2019-12/',
'/movies-coming-soon/2020-01/',
'/movies-coming-soon/2020-02/',
'/movies-coming-soon/2020-03/',
'/movies-coming-soon/2020-04/',
'/movies-coming-soon/2020-05/',
'/movies-coming-soon/2020-06/',
'/movies-coming-soon/2020-07/',
'/movies-coming-soon/2020-08/' ]
for (let i = 0; i < movieArray.length; i++) {
request.get('https://www.imdb.com' + movieArray[i] , (err, res, body) => {
if (!err && res.statusCode == 200) {
console.log(res.request.href)
const $ = cheerio.load(body)
//console.log(next)
$('h4').each((i, v) => {
const date = $(v).text()
console.log(date)
})
}
})
}
我期望它按顺序返回数据,而不是根据由于节点异步行为而返回数据的速度而按顺序返回
答案 0 :(得分:1)
这是https://lavrton.com/javascript-loops-how-to-handle-async-await-6252dd3c795/所述的for循环中的经典异步问题。解决方法如下:
// const request = require('request')
const request = require('request-promise');
const cheerio = require('cheerio');
const movieArray = [
'/movies-coming-soon/2019-09/',
'/movies-coming-soon/2019-10/',
'/movies-coming-soon/2019-11/',
'/movies-coming-soon/2019-12/',
'/movies-coming-soon/2020-01/',
'/movies-coming-soon/2020-02/',
'/movies-coming-soon/2020-03/',
'/movies-coming-soon/2020-04/',
'/movies-coming-soon/2020-05/',
'/movies-coming-soon/2020-06/',
'/movies-coming-soon/2020-07/',
'/movies-coming-soon/2020-08/',
];
async function processMovieArray(array) {
for (const item of array) {
await getMovie(item);
}
console.log('Done');
}
async function getMovie(item) {
const options = {
method: `GET`,
uri: 'https://www.imdb.com' + item,
};
const response = await request(options);
const $ = cheerio.load(response.body);
$('h4').each((i, v) => {
const date = $(v).text();
console.log(date);
});
}
processMovieArray(movieArray);
答案 1 :(得分:0)
技术含量最低的偏离当前代码的方法是仅使用for
循环的索引来填充数组。由于let
循环中的for
将为i
循环的每次迭代为for
创建一个单独的变量,因此我们可以在异步回调中使用该索引来引用所需的变量在结果数组中定位。然后,您还可以使用cntr
来了解何时完成所有结果:
const request = require('request');
const cheerio = require('cheerio');
if (!Array.prototype.flat) {
Array.prototype.flat = function() {
return this.reduce((acc, val) => acc.concat(val), []);
}
}
const movieArray = [ '/movies-coming-soon/2019-09/',
'/movies-coming-soon/2019-10/',
'/movies-coming-soon/2019-11/',
'/movies-coming-soon/2019-12/',
'/movies-coming-soon/2020-01/',
'/movies-coming-soon/2020-02/',
'/movies-coming-soon/2020-03/',
'/movies-coming-soon/2020-04/',
'/movies-coming-soon/2020-05/',
'/movies-coming-soon/2020-06/',
'/movies-coming-soon/2020-07/',
'/movies-coming-soon/2020-08/' ];
let results = [];
let cntr = 0;
for (let i = 0; i < movieArray.length; i++) {
request.get('https://www.imdb.com' + movieArray[i] , (err, res, body) => {
++cntr;
if (!err && res.statusCode == 200) {
console.log(res.request.href)
const $ = cheerio.load(body)
//console.log(next)
let textArray = [];
$('h4').each((i, v) => {
console.log(date)
textArray.push($(v).text());
});
results[i] = textArray;
}
if (cntr === moveArray.length) {
// all results are done now
let allResults = results.flat();
}
})
}
一种更优雅的方法是切换到Promise,并让Promise基础结构为您保留所有内容:
const rp = require('request-promise');
const cheerio = require('cheerio');
if (!Array.prototype.flat) {
Array.prototype.flat = function() {
return this.reduce((acc, val) => acc.concat(val), []);
}
}
const movieArray = [ '/movies-coming-soon/2019-09/',
'/movies-coming-soon/2019-10/',
'/movies-coming-soon/2019-11/',
'/movies-coming-soon/2019-12/',
'/movies-coming-soon/2020-01/',
'/movies-coming-soon/2020-02/',
'/movies-coming-soon/2020-03/',
'/movies-coming-soon/2020-04/',
'/movies-coming-soon/2020-05/',
'/movies-coming-soon/2020-06/',
'/movies-coming-soon/2020-07/',
'/movies-coming-soon/2020-08/' ];
//
if (!Array.prototype.flat) {
Array.prototype.flat = function() {
return this.reduce((acc, val) => acc.concat(val), []);
}
}
Promise.all(movieArray.map(path => {
return rp('https://www.imdb.com' + path).then(body => {
const $ = cheerio.load(body);
let textArray = [];
$('h4').each((i, v) => {
// console.log($(v).text());
textArray.push($(v).text());
});
return textArray;
}).catch(err => {
// ignore errors on urls that didn't work
// so we can get the rest of the results without aborting
console.log("err");
return undefined;
});
})).then(results => {
// flatten the two level array and remove empty items
let allResults = results.flat().filter(item => !!item);
console.log(allResults);
}).catch(err => {
console.log(err);
});
仅供参考,我在nodejs 10.16.0版中测试了第二版,并且可以正常工作。