我正在编写hadoop程序,我真的不想玩弃用的类。 在线任何地方我无法找到更新的程序
org.apache.hadoop.conf.Configuration
类
org.apache.hadoop.mapred.JobConf
类。
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(Test.class);
conf.setJobName("TESST");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
这就是我的main()的样子。 可以请任何人都能为我提供更新的功能。
答案 0 :(得分:18)
这是经典的WordCount示例。您会注意到其他可能没有必要的输入音,阅读您将找出哪些代码。
有什么不同?我正在使用Tool接口和GenericOptionParser来解析作业命令a.k.a:hadoop jar ....
在映射器中你会注意到一个运行的东西。您可以摆脱它,当您提供Map方法的代码时,它通常默认调用。我把它放在那里给你的信息,你可以进一步控制映射阶段。这都是使用新的API。希望对你有帮助。还有其他任何问题,请告诉我们!
import java.io.IOException;
import java.util.*;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.GenericOptionsParser;
public class Inception extends Configured implements Tool{
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
public void run (Context context) throws IOException, InterruptedException {
setup(context);
while (context.nextKeyValue()) {
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
cleanup(context);
}
}
public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
public int run(String[] args) throws Exception {
Job job = Job.getInstance(new Configuration());
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setJarByClass(WordCount.class);
job.submit();
return 0;
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
ToolRunner.run(new WordCount(), otherArgs);
}
}
答案 1 :(得分:1)
同样以经典 WordCount 为例:
org.apache.hadoop.mapred.JobConf
已过时,在新版本中我们使用Configuration
和Job
来实现。
请使用org.apache.hadoop.mapreduce.lib.*
(这是新的API)而不是org.apache.hadoop.mapred.TextInputFormat
(它已经过时了)。
Mapper
和Reducer
并不是什么新鲜事,请参阅main
功能,它包含相对整体的配置,可以根据您的具体要求随时更改。
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
private Text outputKey;
private IntWritable outputVal;
@Override
public void setup(Context context) {
outputKey = new Text();
outputVal = new IntWritable(1);
}
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer stk = new StringTokenizer(value.toString());
while(stk.hasMoreTokens()) {
outputKey.set(stk.nextToken());
context.write(outputKey, outputVal);
}
}
}
class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result;
@Override
public void setup(Context context) {
result = new IntWritable();
}
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for(IntWritable val: values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public class WordCount {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
if(args.length != 2) {
System.err.println("Usage: <in> <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "Word Count");
// set jar
job.setJarByClass(WordCount.class);
// set Mapper, Combiner, Reducer
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
/* Optional, set customer defined Partioner:
* job.setPartitionerClass(MyPartioner.class);
*/
// set output key
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// set input and output path
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// by default, Hadoop use TextInputFormat and TextOutputFormat
// any customer defined input and output class must implement InputFormat/OutputFormat interface
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}