我正在使用 Node.js 的 request、request-promise 和 cheerio 进行网页抓取。
我的代码:
const request = require("request");
const cheerio = require("cheerio");
const rp = require("request-promise");
// Address of the listing page that scrapeJobHeader requests.
const url = "https://singapore.craigslist.org/d/automotive-services/search/aos"
// Module-level array that scrapeJobHeader pushes each parsed listing record into.
const scrapeResults = [];
/**
 * Requests the listing page at `url` and, for every ".result-info" element,
 * pushes a {title, link, datePosted} record onto the module-level
 * `scrapeResults` array.
 *
 * NOTE(review): the only `return` statement is inside the `.each()` callback,
 * so this function has no return value of its own and its promise resolves
 * to `undefined`.
 */
async function scrapeJobHeader() {
try {
const htmResult = await rp.get(url);
const $ = await cheerio.load(htmResult);
$(".result-info").each((index, element) => {
const resultTitle = $(element).children(".result-title");
// NOTE(review): `title` and `link` are assigned without const/let/var;
// no declaration for them is visible in this file.
title = resultTitle.text();
link = resultTitle.attr("href");
const datePosted = $(element).children("time").attr("datetime");
const scrapResult = {title, link, datePosted};
scrapeResults.push(scrapResult);
// Returns from the .each() callback only, not from scrapeJobHeader.
return scrapeResults;
});
} catch (err) {
// The error is logged and not rethrown.
console.error(err);
}
}
/**
 * Starts one page request per job, waits for all of them with Promise.all,
 * and stores the text of "#postingbody" on each job as `description`.
 *
 * @param {Array<Object>} jobWithHeaders - Job records; each one is mutated in place.
 * @returns {Promise<Array>} Resolves to whatever the map callbacks return; the
 *   callback below has no return statement, so every entry is `undefined`.
 *
 * NOTE(review): the callback reads `job.url`, while scrapeJobHeader stores the
 * address under `link` - confirm which property is intended.
 */
async function scrapeDescription(jobWithHeaders) {
return await Promise.all(
jobWithHeaders.map(async job => {
const htmResult = await rp.get(job.url);
const $ = await cheerio.load(htmResult);
// Drop the QR-code container before reading the posting body text.
$(".print-qrcode-container").remove();
job.description = $("#postingbody").text();
})
);
}
/**
 * Runs the header scrape, then the description scrape, and logs the outcome.
 *
 * NOTE(review): scrapeDescription() is called without an argument, and the
 * value logged is `jobFullData` although the variable declared one line
 * earlier is `jobsFullData`.
 */
async function scrapeCraigslist() {
const jobWithHeaders = await scrapeJobHeader();
const jobsFullData = await scrapeDescription();
console.log(jobFullData);
}
// Start the scrape; the returned promise is neither awaited nor given a rejection handler.
scrapeCraigslist();
当我运行代码时,出现如下错误:
C:\Users\Ahmed-PC\craigslist>node index.js
(node:19808) UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'map' of undefined
at scrapeDescription (C:\Users\Ahmed-PC\craigslist\index.js:42:24)
at scrapeCraigslist (C:\Users\Ahmed-PC\craigslist\index.js:62:32)
如何解决这个错误?我哪里做错了?
答案 0 :(得分:0)
您调用的是 `await scrapeDescription();`,但这个函数必须接收一个数组作为参数才能正常工作。
不传参数时,形参 `jobWithHeaders` 的值为 `undefined`,函数随后执行的就是 `undefined.map()`,这正是您看到的错误。
似乎您只需要更改此内容即可:
// Before: scrapeDescription() is called with no argument.
async function scrapeCraigslist() {
const jobWithHeaders = await scrapeJobHeader();
const jobsFullData = await scrapeDescription();
console.log(jobFullData);
}
改为:
/**
 * Runs the full scrape: collects the listing headers, then fetches the
 * description for each of them.
 *
 * @returns {Promise<Array>} Resolves to the value produced by
 *   scrapeDescription(), so callers can consume it via `await` or `.then()`.
 */
async function scrapeCraigslist() {
  const jobWithHeaders = await scrapeJobHeader();
  const jobsFullData = await scrapeDescription(jobWithHeaders); // <===
  // Log the variable that was actually declared above (`jobsFullData`).
  console.log(jobsFullData);
  return jobsFullData;
}
此外,没有理由这样做:
return await Promise.all(...)
将其更改为:
return Promise.all(...)
无论哪种写法,返回的都是解析为同一个值的 Promise。基本上,在 `async` 函数里没有必要写 `return await somePromise`,直接返回该 Promise 即可,不需要 `await`(唯一的例外是 `return` 位于 `try/catch` 中、需要在本函数内捕获该 Promise 的拒绝时)。
`await` 所做的全部工作(如果没有被解释器优化掉)就是等待该 Promise 完成、取出其中的值,再用这个值去解决 `async` 函数早已返回的那个 Promise。
这与不加 `await`、直接返回已有的 Promise 得到的结果完全相同。
更改此:
// Before: results accumulate in a module-level array, and the only `return`
// sits inside the .each() callback rather than in scrapeJobHeader itself.
const scrapeResults = [];
async function scrapeJobHeader() {
try {
const htmResult = await rp.get(url);
const $ = await cheerio.load(htmResult);
$(".result-info").each((index, element) => {
const resultTitle = $(element).children(".result-title");
title = resultTitle.text();
link = resultTitle.attr("href");
const datePosted = $(element).children("time").attr("datetime");
const scrapResult = {title, link, datePosted};
scrapeResults.push(scrapResult);
// Returns from the callback only.
return scrapeResults;
});
} catch (err) {
// Logged and not rethrown.
console.error(err);
}
}
改为:
/**
 * Requests the listing page at `url` and builds one header record per
 * ".result-info" element found in it.
 *
 * @returns {Promise<Array<{title: *, link: *, datePosted: *}>>} A fresh array
 *   on every call: the text and `href` of each ".result-title" child and the
 *   `datetime` attribute of each "time" child. Request errors propagate to
 *   the caller as a rejection.
 */
async function scrapeJobHeader() {
  const listingHtml = await rp.get(url);
  const $ = await cheerio.load(listingHtml);
  const headers = [];

  $(".result-info").each((_position, row) => {
    const anchor = $(row).children(".result-title");
    headers.push({
      title: anchor.text(),
      link: anchor.attr("href"),
      datePosted: $(row).children("time").attr("datetime"),
    });
  });

  return headers;
}
然后,更改此内容:
// Before: the promise returned by scrapeCraigslist() is ignored.
scrapeCraigslist();
改为:
// Fulfilment handler: the scraped value is only available inside this callback.
const handleResults = (results) => {
  // use the results in here only
  console.log(results);
};

// Rejection handler: any error from the scrape, or from handleResults, lands here.
const handleFailure = (err) => {
  console.log(err);
};

scrapeCraigslist().then(handleResults).catch(handleFailure);
然后,更改此:
// Before: the map callback has no return statement, so Promise.all resolves
// to an array whose entries are all undefined.
async function scrapeDescription(jobWithHeaders) {
return await Promise.all(
jobWithHeaders.map(async job => {
const htmResult = await rp.get(job.url);
const $ = await cheerio.load(htmResult);
$(".print-qrcode-container").remove();
job.description = $("#postingbody").text();
})
);
}
改为:
/**
 * Fetches the detail page of every job and attaches its posting text.
 *
 * One request per job is started and all of them are awaited together with
 * Promise.all, so a single failed request rejects the whole result.
 *
 * @param {Array<Object>} jobWithHeaders - Header records as built by
 *   scrapeJobHeader ({title, link, datePosted}). Each record is mutated in
 *   place: a `description` property is added.
 * @returns {Promise<Array<Object>>} Resolves to the same job objects, in input
 *   order, each carrying its `description`.
 */
function scrapeDescription(jobWithHeaders) {
  return Promise.all(
    jobWithHeaders.map(async (job) => {
      // The header records store the page address under `link`; `url` is
      // still accepted as a fallback for callers that use that name.
      const htmResult = await rp.get(job.link || job.url);
      const $ = await cheerio.load(htmResult);
      // Drop the QR-code container before reading the posting body text.
      $(".print-qrcode-container").remove();
      job.description = $("#postingbody").text();
      return job;
    })
  );
}