我在一个简单的Hadoop Streaming作业中使用Node.js包cheerio时遇到了问题,使用的映射器(mapper)如下:
#!/usr/bin/env nodejs
//mapper.js
// Mapper stream plumbing: buffers stdin, cuts it into lines and passes every
// complete line to proc(). Whatever follows the last newline is kept in
// `input` until more data arrives or the stream ends.
var stdin = process.openStdin();
var stdout = process.stdout;
var input = ''; // unterminated tail of the most recent chunk
// Loaded at startup; require() throws if the module cannot be resolved from
// where this script runs.
var cheerio = require('cheerio');
stdin.setEncoding('utf8');
stdin.on('data', function(data) {
  if (!data) {
    return;
  }
  input += data;
  // Split explicitly rather than through the deprecated RegExp.leftContext /
  // RegExp.rightContext statics, which are global state that any other regex
  // call would overwrite.
  var lines = input.split(/\r?\n/);
  // The last piece has no terminating newline yet, so carry it over.
  input = lines.pop();
  for (var i = 0; i < lines.length; i++) {
    proc(lines[i]);
  }
});
stdin.on('end', function() {
  // Flush a final line that was not newline-terminated.
  if (input) {
    proc(input);
  }
});
/**
 * Emits one "<key>,1" record for an input line, where the key is the ninth
 * space-separated field (index 8) of the line.
 *
 * Lines with fewer than nine fields are skipped; previously they produced the
 * bogus record "undefined,1".
 *
 * @param {string} line - A single input line without its line terminator.
 */
function proc(line) {
  var words = line.split(' ');
  if (words.length <= 8) {
    return; // no ninth field to use as a key
  }
  stdout.write(words[8] + ',1\n');
}
归约器(reducer):
#!/usr/bin/env nodejs
//reducer.js
// Reducer stream plumbing: buffers stdin, cuts it into lines, passes every
// complete line to proc(), and prints the accumulated totals as "key:count"
// once the input ends.
var stdin = process.openStdin();
var stdout = process.stdout;
// Prototype-less dictionary so that keys such as "constructor" or "toString"
// are not mistaken for existing entries.
var counter = Object.create(null);
var input = ''; // unterminated tail of the most recent chunk
stdin.setEncoding('utf8');
stdin.on('data', function(data) {
  if (!data) {
    return;
  }
  input += data;
  // Split explicitly rather than through the deprecated RegExp.leftContext /
  // RegExp.rightContext statics, which are global state that any other regex
  // call would overwrite.
  var lines = input.split(/\r?\n/);
  // The last piece has no terminating newline yet, so carry it over.
  input = lines.pop();
  for (var i = 0; i < lines.length; i++) {
    proc(lines[i]);
  }
});
stdin.on('end', function() {
  // Flush a final line that was not newline-terminated.
  if (input) {
    proc(input);
  }
  for (var k in counter) {
    stdout.write(k + ':' + counter[k] + '\n');
  }
});
/**
 * Folds one "<key>,<count>" record into the shared `counter` totals.
 *
 * The key is the text before the first comma and the count is the integer
 * parsed from the field after it. Records whose count is not a number are
 * ignored so they cannot turn a running total into NaN.
 *
 * @param {string} line - A single input record without its line terminator.
 */
function proc(line) {
  var words = line.split(',');
  var word = words[0];
  var count = parseInt(words[1], 10);
  if (isNaN(count)) {
    return; // malformed or empty record
  }
  // Own-property check: a plain truthiness test would treat inherited keys
  // (e.g. "constructor") as already counted.
  if (!Object.prototype.hasOwnProperty.call(counter, word)) {
    // Start from zero so the first record contributes its real count
    // instead of a hard-coded 1.
    counter[word] = 0;
  }
  counter[word] += count;
}
我使用以下Hadoop命令运行该作业:
#!/bin/bash
# Runs the Hadoop Streaming job with mapper.js as the map step and reducer.js
# as the reduce step, reading /input.txt and writing to index_out.
# Only mapper.js and reducer.js are shipped with the job via -file.
# NOTE(review): anything those scripts require() is presumably expected to be
# resolvable on the task nodes already - confirm.
STREAMING_JAR=/home/user/hadoop-2.5.2/share/hadoop/tools/lib/hadoop-streaming-2.5.2.jar
hadoop jar "$STREAMING_JAR" \
  -file mapper.js \
  -file reducer.js \
  -input /input.txt \
  -output index_out \
  -mapper mapper.js \
  -reducer reducer.js
如果去掉“var cheerio = require('cheerio');”这一行,Hadoop Streaming作业可以正常运行。但只要包含这一行,就会抛出如下Java异常:
14/11/25 07:52:29 INFO mapreduce.Job: Running job: job_1416872985978_0001
14/11/25 07:52:35 INFO mapreduce.Job: Job job_1416872985978_0001 running in uber mode : false
14/11/25 07:52:35 INFO mapreduce.Job: map 0% reduce 0%
14/11/25 07:52:40 INFO mapreduce.Job: Task Id : attempt_1416872985978_0001_m_000000_0, Status : FAILED
Error: java.lang.RuntimeException: PipeMapRed.waitOutputThreads(): subprocess failed with code 8
at org.apache.hadoop.streaming.PipeMapRed.waitOutputThreads(PipeMapRed.java:320)
at org.apache.hadoop.streaming.PipeMapRed.mapRedFinished(PipeMapRed.java:533)
at org.apache.hadoop.streaming.PipeMapper.close(PipeMapper.java:130)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:61)
at org.apache.hadoop.streaming.PipeMapRunner.run(PipeMapRunner.java:34)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:430)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:342)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:168)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1614)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:163)
恳请熟悉Hadoop和Node.js的专家指点,任何帮助我都不胜感激。谢谢。