如何在 Hadoop Streaming 中使用 NodeJS 的 cheerio 包

时间:2014-11-25 00:04:44

标签: node.js hadoop streaming

我在一个简单的 Hadoop Streaming 作业中使用 NodeJS 包 cheerio 时遇到了问题,映射器(mapper)如下:

#!/usr/bin/env nodejs
//mapper.js
// Standard streams of this process: lines are read from stdin, and proc()
// writes its records to stdout.
var stdin = process.openStdin();
var stdout = process.stdout;
// Text received so far that has not yet been terminated by a newline.
var input = '';
// Loaded at startup; not referenced by the rest of the code shown here.
var cheerio = require('cheerio');

stdin.setEncoding('utf8');

// A chunk can end in the middle of a line, so chunks are appended to `input`
// and only the complete lines are handed to proc(); the unfinished tail stays
// buffered for the next chunk.
stdin.on('data', function(data) {
  if (!data) {
    return;
  }
  input += data;
  var newline;
  // Use the match object's own index and length instead of the deprecated
  // RegExp.leftContext / RegExp.rightContext static properties.
  while ((newline = /\r?\n/.exec(input)) !== null) {
    var line = input.slice(0, newline.index);
    input = input.slice(newline.index + newline[0].length);
    proc(line);
  }
});

// The last line may lack a trailing newline; process whatever is still buffered.
stdin.on('end', function() {
  if (input) {
    proc(input);
  }
});

/**
 * Emits one "<field>,1" record for a single input line.
 *
 * The line is split on single spaces and the ninth field (index 8) is written
 * to stdout followed by ",1" and a newline.
 *
 * @param {string} line - One input line without its line terminator.
 */
function proc(line) {
  var words = line.split(' ');
  // A blank or short line has no ninth field; skip it instead of emitting
  // the meaningless record "undefined,1".
  if (words.length < 9) {
    return;
  }
  stdout.write(words[8] + ',1\n');
}

归约器(reducer):

#!/usr/bin/env nodejs
//reducer.js     
// Standard streams of this process: records are read from stdin and the
// totals are written to stdout once the input ends.
var stdin = process.openStdin();
var stdout = process.stdout;
// Running total per key. Created without a prototype so that keys such as
// "constructor" or "toString" cannot collide with inherited properties.
var counter = Object.create(null);
// Text received so far that has not yet been terminated by a newline.
var input = '';

stdin.setEncoding('utf8');

// A chunk can end in the middle of a line, so chunks are appended to `input`
// and only the complete lines are handed to proc(); the unfinished tail stays
// buffered for the next chunk.
stdin.on('data', function(data) {
  if (!data) {
    return;
  }
  input += data;
  var newline;
  // Use the match object's own index and length instead of the deprecated
  // RegExp.leftContext / RegExp.rightContext static properties.
  while ((newline = /\r?\n/.exec(input)) !== null) {
    var line = input.slice(0, newline.index);
    input = input.slice(newline.index + newline[0].length);
    proc(line);
  }
});

// On end of input, process any unterminated last line, then write one
// "key:total" line per key.
stdin.on('end', function() {
  if (input) proc(input);
  for (var k in counter) {
    stdout.write(k + ':' + counter[k] + '\n');
  }
});

/**
 * Adds one "key,count" record to the running totals in `counter`.
 *
 * The line is split on commas; the first field is the key and the second is
 * parsed as a base-10 integer count.
 *
 * @param {string} line - One input record without its line terminator.
 */
function proc(line) {
  var fields = line.split(',');
  var word = fields[0];
  var count = parseInt(fields[1], 10);
  // A blank or malformed line has no numeric count; skip it so it neither
  // creates a key nor turns an existing total into NaN.
  if (isNaN(count)) {
    return;
  }
  // Check for an own property so inherited names (e.g. "constructor") are not
  // mistaken for existing totals, and start a new key at its real count.
  if (Object.prototype.hasOwnProperty.call(counter, word)) {
    counter[word] += count;
  } else {
    counter[word] = count;
  }
}

以下hadoop命令:

#!/bin/bash 
# Submit a Hadoop Streaming job (hadoop-streaming-2.5.2.jar) that runs
# mapper.js as the mapper and reducer.js as the reducer, reading /input.txt
# and writing the result to index_out.
# The two -file options name the script files to send along with the job.
# NOTE(review): only mapper.js and reducer.js are listed here; confirm that any
# npm packages the scripts load (mapper.js calls require('cheerio')) are
# available wherever the tasks actually run.
hadoop jar /home/user/hadoop-2.5.2/share/hadoop/tools/lib/hadoop-streaming-2.5.2.jar \
-file mapper.js -file reducer.js \
-input /input.txt -output index_out -mapper mapper.js -reducer reducer.js

去掉“var cheerio = require('cheerio');”这一行时,streaming 脚本可以完美运行。但是一旦包含该行,就会产生如下 Java 异常:

14/11/25 07:52:29 INFO mapreduce.Job: Running job: job_1416872985978_0001
14/11/25 07:52:35 INFO mapreduce.Job: Job job_1416872985978_0001 running in uber mode : false
14/11/25 07:52:35 INFO mapreduce.Job:  map 0% reduce 0%
14/11/25 07:52:40 INFO mapreduce.Job: Task Id : attempt_1416872985978_0001_m_000000_0, Status : FAILED
Error: java.lang.RuntimeException: PipeMapRed.waitOutputThreads(): subprocess failed with code 8
    at org.apache.hadoop.streaming.PipeMapRed.waitOutputThreads(PipeMapRed.java:320)
    at org.apache.hadoop.streaming.PipeMapRed.mapRedFinished(PipeMapRed.java:533)
    at org.apache.hadoop.streaming.PipeMapper.close(PipeMapper.java:130)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:61)
    at org.apache.hadoop.streaming.PipeMapRunner.run(PipeMapRunner.java:34)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:430)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:342)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:168)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1614)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:163)

恳请 Hadoop 和 NodeJS 专家指点,任何帮助我都将不胜感激。谢谢。

0 个答案:

没有答案