How do I add a server-side delay to a JavaScript for loop?

Time: 2017-12-18 01:30:46

Tags: javascript node.js cheerio requestjs

I'm experimenting with scraping data from an e-commerce site using Node.js. I use Request to retrieve the page's DOM and Cheerio for server-side DOM selection.

const cheerio = require('cheerio');
const request = require('request');

// takes a URL, scrapes the page, and returns an object with the data
let scrapePage = (url) => {

    return new Promise((resolve, reject) => {

        request(url, (error, resp, body) => {

            if(error){
                reject(error);
            };

            let $ = cheerio.load(body); 
            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();

            let obj = {
                url: $url,
                price: $price
            }

            resolve(obj);

        });

    });

};

// Runs scrapePage in a loop
// There is a variable called arrayOfURLs defined elsewhere that contains 100s of URLs

for( let i = 0; i < arrayOfURLs.length; i++){
    scrapePage(arrayOfURLs[i])
        .then((obj) => {
            //write to a file
        })
        .catch((error) => {
        })
};

The problem is that the server I'm sending requests to sometimes sends back blank data, I assume because I'm sending too many requests without any pause. Because of the asynchronous nature of JS, I'm having a hard time figuring out how to add an effective delay between each iteration of the loop. Just adding a setTimeout in a synchronous way isn't enough, because setTimeout itself is asynchronous, and I'm running this on the server so there's no Window object.
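For example, just dropping a setTimeout into the loop doesn't create a pause between requests (illustrative sketch only):

// This does NOT add a delay between iterations: the loop keeps running
// and every timer is scheduled right away, so all the requests still
// fire at roughly the same moment, just one second later.
for( let i = 0; i < arrayOfURLs.length; i++){
    setTimeout(() => {
        scrapePage(arrayOfURLs[i])
            .then((obj) => {
                //write to a file
            })
            .catch((error) => {
            });
    }, 1000);
}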

EDIT

The code above is a simplified version of what I'm working with. The full code is as follows:

app.js

const fs = require('fs');
const path = 'urls.txt';
const path2 = 'results.txt';
const scraper = require('./scraper');

let scrapePage = (url) => {
    scraper.scrapePage(url)
        .then((obj) => {
            // console.log('obj from the scraper with Promises was received');
            // console.log(obj);
            // console.log('writing obj to a file');
            fs.appendFile(path2, JSON.stringify(obj) + ', ', (error) => {
                if(error){
                    console.log(error);
                } else {
                    // console.log('Successfully wrote to ' + path2);
                }
            })
        })
        .catch((error) => {
            console.log('There was an error scraping obj: ');
            console.log(error);
        })  
}

fs.readFile(path, 'utf8', (err, data) => {

  if (err){
    throw err;
  };

  var urlArray = JSON.parse(data);

  // this returns an Unexpected Identifier error    
  // const results = await Promise.all(urlArray.map(scrapePage));

  // this returns an Unexpected Token Function error
  // async function scrapePages(){
  //    const results = await Promise.all(urlArray.map(scrapePage));
  // };

});

scraper.js

const request = require('request');
const cheerio = require('cheerio');

exports.scrapePage = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (error, resp, body) => {
            if(error){
                reject(error);
            };

            let $ = cheerio.load(body); 
            let $url = url;

            let $price = $('#rt-mainbody > div > div.details > div.itemData > div:nth-child(4) > div.description').text();

            let obj = {
                url: $url,
                price: $price
            }

            resolve(obj);

        })
    })
}

3 Answers:

Answer 0 (score: 2)

It looks to me like you aren't waiting for your promises to resolve before sending the server response. You could eliminate the for loop entirely using async/await, e.g.

const results = await Promise.all(arrayOfURLs.map(scrapePage));
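Note that await is only valid inside an async function (on a Node version with native async/await support), which is why the commented-out attempts in app.js throw syntax errors. A minimal sketch of how it could be wrapped (the scrapeAllPages name is just for illustration):

// Sketch: await must live inside an async function.
async function scrapeAllPages(arrayOfURLs){
    // fires all requests and waits for every one of them to resolve
    const results = await Promise.all(arrayOfURLs.map(scrapePage));
    return results;
}

scrapeAllPages(arrayOfURLs)
    .then((results) => {
        // write results to a file here
    })
    .catch((error) => {
        console.log(error);
    });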

Answer 1 (score: 1)

If you want no more than x active connections at a time, you can use throttle. Or, if you want no more than x requests per second, you can use throttlePeriod.
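Neither throttle nor throttlePeriod is built into Node; they are small helper functions. A minimal sketch of what a concurrency-limiting throttle could look like (an assumption, not the answer's original implementation; throttlePeriod would be similar but based on timestamps rather than an active-connection count):

// Hypothetical sketch: wraps an async function so that at most `limit`
// calls are in flight at any one time.
const throttle = (limit) => (fn) => {
    let active = 0;          // number of calls currently running
    const queue = [];        // calls waiting for a free slot

    const next = () => {
        if (active >= limit || queue.length === 0) return;
        active++;
        const { args, resolve, reject } = queue.shift();
        Promise.resolve(fn(...args))
            .then(resolve, reject)
            .then(() => { active--; next(); });
    };

    return (...args) => new Promise((resolve, reject) => {
        queue.push({ args, resolve, reject });
        next();
    });
};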

Promise.all will never call your resolve handler if even a single request fails, so you can catch any error and resolve with a Fail object instead:

const Fail = function(details){this.details=details;};
const max10 = throttle(10)(scrapePage); // max 10 active connections
// const twoPerSecond = throttlePeriod(2, 1000)(scrapePage); // start no more than 2 per second
Promise.all(
  arrayOfURLs.map(
    url =>
      max10(url)
      .catch(err => new Fail([err, url]))
  )
)
.then(
  results => {
    // split the settled results into successful scrapes and failures
    const successes =
      results.filter(
        result => (result && result.constructor) !== Fail
      );
    const failed =
      results.filter(
        result => (result && result.constructor) === Fail
      );
  }
);

Answer 2 (score: 1)

const cheerio = require('cheerio');
const request = require('request');

let scrapePage = (url) => {

    return new Promise((resolve, reject) => {

        request(url, (error, resp, body) => {

            if(error){
                reject(error);
                return;
            }

            if(!body) {
                reject('Empty Body');
                return;
            }

            let $ = cheerio.load(body);

            let $url = url;
            let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();

            let obj = {
                url: $url,
                price: $price
            }

            resolve(obj);

        });

    });
};

function processUrl(url){
    scrapePage(url)
        .then((obj) => {
            // write to a file, then move on to the next URL
            if(arrayOfURLs.length > 0)
                processUrl(arrayOfURLs.pop());
        })
        .catch((error) => {
            // put the failed URL back in the queue and keep going
            arrayOfURLs.unshift(url);
            if(arrayOfURLs.length > 0)  // could also go in a finally block
                processUrl(arrayOfURLs.pop());
        });
}

processUrl(arrayOfURLs.pop());

Here we use the arrayOfURLs array as a queue: if we receive an error or a blank page, we push that URL back into the array. That way we can process every URL one at a time, in a sequential fashion.
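If you also want an explicit pause between requests, a promise-wrapped setTimeout can be chained in before the next call. A minimal sketch building on processUrl above (the 500 ms value is just an example):

// Helper: resolves after `ms` milliseconds. Node provides setTimeout as a
// global, so no Window object is needed.
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

function processUrl(url){
    scrapePage(url)
        .then((obj) => {
            // write to a file
        })
        .catch((error) => {
            // retry the failed URL later by putting it back in the queue
            arrayOfURLs.unshift(url);
        })
        .then(() => delay(500)) // pause before the next request
        .then(() => {
            if(arrayOfURLs.length > 0)
                processUrl(arrayOfURLs.pop());
        });
}

processUrl(arrayOfURLs.pop());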