Question

我已经在node中创建了一个脚本，该脚本使用promise结合request和cheerio来解析links列下的Province从此webpage中提取，然后重复使用这些链接从所有此类pages中的Office列中抓取所有网址，最后利用这些links来收集title来自所有此类target pages，例如本页的Cairos main Post Office。

大多数情况下，我当前的脚本卡住了。但是，有时会引发此错误UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'parent' of undefined。我检查了每个功能，发现它们分别以正确的方式工作。

尽管脚本看起来更大，但它是基于一个非常简单的逻辑构建的，该逻辑利用其着陆页中的每个links直到到达目标页的title为止。

这是我到目前为止的尝试：

const request = require('request');
const cheerio = require('cheerio');

const link = 'https://www.egyptcodebase.com/en/p/all';
const base_link = 'https://www.egyptcodebase.com/en/';

const items = [];
const nitems = [];

let getLinks = () => {
    return new Promise((resolve, reject) => {
        request(link, function(error, response, html) {
            let $ = cheerio.load(html);
            if (error) return reject(error);
            try {
                $('.table tbody tr').each(function() {
                    items.push(base_link + $(this).find("a").attr("href"));
                });
                resolve(items);
            } catch (e) {
                reject(e);
            }
        });
    });
};

let getData = (links) => {
    const promises = links
        .map(nurl => new Promise((resolve, reject) => {
            request(nurl, function(error, response, html) {
                let $ = cheerio.load(html);
                if (error) return reject(error);
                try {
                    $('.table tbody tr').each(function() {
                        nitems.push(base_link + $(this).find("a").attr("href"));
                    });
                    resolve(nitems);
                } catch (e) {
                    reject(e);
                }
            })
        }))

    return Promise.all(promises)
}

let FetchData = (links) => {
    const promises = links
        .map(nurl => new Promise((resolve, reject) => {
            request(nurl, function(error, response, html) {
                let $ = cheerio.load(html);
                if (error) return reject(error);
                try {
                    resolve($(".home-title > h2").eq(0).text());
                } catch (e) {
                    reject(e);
                }
            })
        }))

    return Promise.all(promises)
}

getLinks().then(resultList => {
    getData(resultList).then(resultSet => {
        FetchData(resultSet).then(title =>{
            console.log(title);
        })
    })
})

如何使用目标网页上的所有链接从目标网页上抓取标题？

Answer 1

向网站所有者询问所需数据会容易得多。
他可能会理解您的请求并将其免费提供给您，而不是抓取他的网站。

P.S：我很惊讶地发现一个有关如何废弃自己的网站的问题。
  P.S2：如果您只需要所有邮局书名，我可以免费为您提供：D
  P.S3：您的错误可能正在发生，因为一段时间以来，页面没有您要使用cheerio尝试解析的元素。

Answer 2

所以问题出在2D数组上。如果您仔细检查getData函数，那么您将返回2D数组。

map返回一个数组，并且在该映射中您正在解析另一个数组nitems。

这是工作代码：

const base_link = 'https://www.egyptcodebase.com/en/';

// helper wrapper DRY
const getHtmls = (url) => {
  return new Promise((resolve, reject) => {
    request({ uri: url, method: 'GET', followAllRedirects: true } , function(error, response, html) {
      if (error) reject(error);
      else resolve(html);
    });
  })
}

let getLinks = async () => {
  const link = 'https://www.egyptcodebase.com/en/p/all';
  const items = [];
  try {
    const html = await getHtmls(link);
    let $ = cheerio.load(html);
    $('.table tbody tr').each(function() {
      items.push(base_link + $(this).find("a").attr("href"));
    });
  } catch (e) {
    // handling error here so execution can continue for good eggs
    console.error(e.message)
  }
  return items;
};

let getData = async (links) => {
  const out = [];
  try {
    const promises = links.map(nurl => getHtmls(nurl));

    const htmls = await Promise.all(promises);
    htmls.forEach(html => {
      let $ = cheerio.load(html);
      $('.table tbody tr').each(function() {
        out.push(base_link + $(this).find("a").attr("href"));
      });
    })
  } catch (e) {
    // handling error here so execution can continue for good eggs
    console.error(e.message)
  }
  return out;
}

let FetchData = async (links) => {
  const out = [];
  try {
    const promises = links.map(nurl => getHtmls(nurl));
    const htmls = await Promise.all(promises)
    htmls.forEach(html => {
      try {
        let $ = cheerio.load(html);
        out.push($(".home-title > h2").eq(0).text());
      } catch (e){
        // handling error here so execution can continue for good eggs
        console.error(e.message)
      }
    })
  } catch (e) {
    // handling error here so execution can continue for good eggs
    console.error(e.message)
  }
  return out;
}

getLinks().then(resultList => {
  getData(resultList).then(resultSet => {
    FetchData(resultSet).then(title =>{
      console.log(title);
    })
  })
})

注意：您可以使用Promise包来代替编写自己的request-promise包装器

Answer 3

代码的问题在FetchData函数中，因为在该函数中您传递链接，然后在其上使用映射。但是，如果您查看该映射函数内部并检查“ nurl”变量的值，它将是一个链接数组，并且其数据类型将是对象。根据请求函数的语义，其第一个参数应为字符串，因此，如果对“ nurl”变量进行迭代以获取值，则它将起作用。

My code snippet for one url from array

无法利用链接获取其他标题

3 个答案: