Question

这是我用来抓取页面的代码（我使用的是 request 和 cheerio 模块）：

// Starts one request per page (j = 1 .. nbRequest - 1) and pushes a promise for
// each page's scraped data onto `promises`.
// Relies on names defined outside this snippet: request, cheerio, promises,
// nbRequest, nbRanks and nb_ranks.
for (let j = 1; j < nbRequest; j++) {
  const currentPromise = new Promise((resolve, reject) => {
    request(`https://www.url${j}`, (error, response, body) => {
      // Settle the promise on failure and stop here: falling through would
      // dereference an undefined response and leave the promise pending forever.
      if (error || !response) {
        console.log("Error: " + error);
        return reject(error || new Error("No response received"));
      }

      console.log("Status code: " + response.statusCode + ", Connected to the page");

      const $ = cheerio.load(body);
      // Resolved value: three arrays filled from the matching table cells.
      const output = {
        ranks: [],
        names: [],
        numbers: [],
      };

      $('td.rangCell').each(function () {
        const rankText = $(this).text().trim();
        // Skip the cell whose text is "Rang" (presumably the column header).
        if (rankText !== "Rang") {
          // Drop the last character of the rank text before storing it.
          output.ranks.push(rankText.slice(0, -1));
          // NOTE(review): this reads nb_ranks but writes nbRanks; it looks like
          // a naming slip for a single counter. Both names live outside this
          // snippet, so the statement is kept as-is. Confirm and unify.
          nbRanks = nb_ranks + 1;
        }
      });

      $('td.nameCell:has(label)').each(function () {
        output.names.push($(this).find('label.nameValue > a').text().trim());
      });

      $('td.numberCell').each(function () {
        const numberText = $(this).text().trim();
        // Skip the cell whose text is "Nombre" (presumably the column header).
        if (numberText !== "Nombre") {
          output.numbers.push(numberText);
        }
      });

      console.log("HERE 1");
      return resolve(output);
    });
  });
  promises.push(currentPromise);
}

之后，我使用一个 Node 模块解析结果并将其保存到 CSV 文件中。目前我已经能够抓取大约 100 个页面，但当页面数量更大（1000+）时，我会收到 500 响应，我想这意味着我被服务器踢掉了。所以我认为最好的解决方案是延迟请求，但我还没有找到实现方法。你们有什么想法吗？代码大概应该是什么样子？

Answer 1

您正在寻找的是“控制流”，您可以使用 async.queue 来实现此目的。

如果将每个请求添加到队列中，就可以通过 worker 的数量（即并发数）来控制并行请求的数量。您可以在请求回调的最后部分加上 setTimeout，以实现延迟请求。

此外，我建议使用现成的“爬虫”包（而不是自己实现），例如 npm-crawler，因为它们自带速率限制功能，并且已经处理了您可能会遇到的其他问题 :) 例如用户代理（User-Agent）池。

更新

const async = require("async");
const delayTime = 1500; // milliseconds: wait 1.5 seconds after each request finishes

/**
 * Template: performs the request for a single task.
 * @param {*} csvLine - the task taken from the queue.
 * @returns {Promise} settles when the request for that task is done.
 */
function getRequestPromise(csvLine) {
  return new Promise((resolve, reject) => {
    // Make your request here, then call resolve(result) or reject(error).
  });
}

// Queue with concurrency 1: one request at a time.
const asyncQueue = async.queue((task, callback) => {
  // Release the worker only after delayTime, whether the request succeeded or
  // failed; without the failure path a rejected request would stall the queue.
  const releaseAfterDelay = (err) => {
    setTimeout(() => {
      callback(err ?? null);
    }, delayTime);
  };
  getRequestPromise(task).then(() => releaseAfterDelay(null), releaseAfterDelay);
}, 1);

// `csv` is a placeholder for the collection of tasks to enqueue.
for (const csvLine of csv) {
  asyncQueue.push(csvLine, () => {});
}

// NOTE(review): assigning `drain` as a property is the async v2 style; async v3
// appears to expect `asyncQueue.drain(() => { ... })`. Confirm against the
// installed version.
asyncQueue.drain = () => {
  console.log("finished.");
};

使用 request 和 cheerio 模块延迟请求

1 个答案: