通过Heroku Scheduler(Node.js)运行的刮擦请求失败了一半

时间:2016-12-25 04:13:33

标签: javascript node.js heroku request cheerio

我设置了一个非常基本的网络剪贴簿,以便为我的祖父检查Costco.com上特定商品的库存。它在本地工作得很好,但是当我通过Heroku运行时,它失败了(似乎有50%的时间)。这是刮刀的代码

const task = () => {

  // toggle so doesn't send message multiple times if continuously available
  let alreadyAvailable = false;

  let url = 'http://www.costco.com/Kirkland-Signature-Four-Piece-Urethane-Cover-Golf-Ball,-2-dozen.product.100310467.html';
  request(url, function(error, response, html){

    let $ = cheerio.load(html);

    if(error){
      throw new Error(error);
    }

    if ( $('#product-page #product-details #ctas #add-to-cart input[type="button"]')['0'].attribs.value === 'Out of Stock') {
      alreadyAvailable = false;
      console.log("still out of stock");
    } else {
      if (alreadyAvailable === false) {
        sendMessage();
        alreadyAvailable = true;
      }
    }

  });
};

以下是日志

2016-12-25T03:48:39.675549+00:00 heroku[scheduler.5440]: Starting process with command `node scraper.js`
2016-12-25T03:48:40.262503+00:00 heroku[scheduler.5440]: State changed from starting to up
2016-12-25T03:48:41.509416+00:00 app[scheduler.5440]: /app/scraper.js:34
2016-12-25T03:48:41.509432+00:00 app[scheduler.5440]:     if ( $('#product-page #product-details #ctas #add-to-cart input[type="button"]')['0'].attribs.value === 'Out of Stock') {
2016-12-25T03:48:41.509433+00:00 app[scheduler.5440]:                                                                                          ^
2016-12-25T03:48:41.509433+00:00 app[scheduler.5440]:
2016-12-25T03:48:41.509434+00:00 app[scheduler.5440]: TypeError: Cannot read property 'attribs' of undefined
2016-12-25T03:48:41.509434+00:00 app[scheduler.5440]:     at Request._callback (/app/scraper.js:34:90)
2016-12-25T03:48:41.509435+00:00 app[scheduler.5440]:     at Request.self.callback (/app/node_modules/request/request.js:186:22)
2016-12-25T03:48:41.509436+00:00 app[scheduler.5440]:     at emitTwo (events.js:106:13)
2016-12-25T03:48:41.509436+00:00 app[scheduler.5440]:     at Request.emit (events.js:191:7)
2016-12-25T03:48:41.509436+00:00 app[scheduler.5440]:     at Request.<anonymous> (/app/node_modules/request/request.js:1081:10)
2016-12-25T03:48:41.509437+00:00 app[scheduler.5440]:     at emitOne (events.js:96:13)
2016-12-25T03:48:41.509437+00:00 app[scheduler.5440]:     at Request.emit (events.js:188:7)
2016-12-25T03:48:41.509438+00:00 app[scheduler.5440]:     at IncomingMessage.<anonymous> (/app/node_modules/request/request.js:1001:12)
2016-12-25T03:48:41.509438+00:00 app[scheduler.5440]:     at IncomingMessage.g (events.js:291:16)
2016-12-25T03:48:41.509439+00:00 app[scheduler.5440]:     at emitNone (events.js:91:20)
2016-12-25T03:48:41.509439+00:00 app[scheduler.5440]:     at IncomingMessage.emit (events.js:185:7)
2016-12-25T03:48:41.509439+00:00 app[scheduler.5440]:     at endReadableNT (_stream_readable.js:974:12)
2016-12-25T03:48:41.509440+00:00 app[scheduler.5440]:     at _combinedTickCallback (internal/process/next_tick.js:74:11)
2016-12-25T03:48:41.509440+00:00 app[scheduler.5440]:     at process._tickCallback (internal/process/next_tick.js:98:9)
2016-12-25T03:48:41.560539+00:00 heroku[scheduler.5440]: State changed from up to complete
2016-12-25T03:48:41.550655+00:00 heroku[scheduler.5440]: Process exited with status 1
2016-12-25T03:58:42.438807+00:00 app[api]: Starting process with command `node scraper.js` by user scheduler@addons.heroku.com
2016-12-25T03:58:43.701468+00:00 heroku[scheduler.5038]: Starting process with command `node scraper.js`
2016-12-25T03:58:44.312279+00:00 heroku[scheduler.5038]: State changed from starting to up
2016-12-25T03:58:45.769564+00:00 app[scheduler.5038]: still out of stock
2016-12-25T03:58:45.827867+00:00 heroku[scheduler.5038]: State changed from up to complete
2016-12-25T03:58:45.814921+00:00 heroku[scheduler.5038]: Process exited with status 0

您可以看到,有时我会在if-block中获取控制台日志,而其他人则会收到类型错误,因为它试图从不存在的html元素中读取属性。我当时认为这可能是一个异步问题,但我不确定如何修复它。我假设Request没有运行回调,直到它获得了所有的html。

1 个答案:

答案 0 :(得分:0)

这里的问题是好市多的网站正在返回。

通过Cheerio解析DOM时,您的代码失败了。在您的情况下,这意味着您尝试抓取的特定HTML实际上并不存在(这就是错误所说的内容)。

这可能是由一些可能的事情引起的:

  • Costco呈现的页面与您预期的不同(可能它认为您是机器人,或正在进行一些限制)。
  • 您正在接收重定向或其他类型的非错误HTTP状态代码,并且您正在寻找的HTML不存在。
  • Costco的网站动态更改HTML以防止人们刮擦。

如果我是你,我会怎么做:

  • 让您的流程在任务运行时记录所有页面的HTML。
  • 下次您的流程失败时,将Heroku日志中的HTML复制到本地编辑器中,看看它返回的内容。

我愿意打赌你会感到惊讶=)