我在一个程序中实现了ChainMapper,其中包含两个Mapper和一个Reducer。以下是使用ChainMapper的实现代码:
该代码用于统计每个单词出现的次数,即WordCount。
第一个Mapper从输入文本文件中读取每一行,将其拆分为单词,并将每个单词写入Context。
第二个Mapper接收第一个Mapper的输出,并将所有键(单词)转换为大写。
转换后的大写键会再次写入Context。
Reducer接收第二个Mapper的输出,相同键对应的所有值会被发送到同一个reduce调用。
在Reducer中,我们只是对每个键对应的值求和,从而完成单词计数。
import java.io.IOException;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
//implementing CHAIN MAPREDUCE without using custom format
//SPLIT MAPPER
/**
 * First mapper of the chain: breaks each input line into whitespace-separated
 * tokens and emits every token with a count of 1.
 *
 * <p>Input: (any key, one line of text). Output: (token, 1).
 */
class SplitMapper extends Mapper<Object, Text, Text, IntWritable> {
    /** Constant count of 1 emitted alongside every token. */
    private final IntWritable dummyValue = new IntWritable(1);

    /**
     * Emits {@code (token, 1)} for every whitespace-separated token in {@code value}.
     *
     * @param key     input key; not used
     * @param value   the text to tokenize
     * @param context sink that receives one record per token
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // StringTokenizer with its default delimiters (space, tab, newline, carriage
        // return, form feed) never returns an empty token, so leading, trailing or
        // repeated whitespace no longer produces "" keys that get counted as words,
        // which is what String.split(" ") did.
        StringTokenizer tokenizer = new StringTokenizer(value.toString());
        while (tokenizer.hasMoreTokens()) {
            context.write(new Text(tokenizer.nextToken()), dummyValue);
        }
    }
}
//UPPER CASE MAPPER
/**
 * Second mapper of the chain: rewrites each incoming key in upper case and
 * forwards the value unchanged.
 *
 * <p>Input: (word, count). Output: (WORD, count).
 */
class UpperCaseMapper extends Mapper<Text, IntWritable, Text, IntWritable> {
    /**
     * Emits the upper-cased form of {@code key} paired with the original {@code value}.
     *
     * @param key     the word to convert
     * @param value   the count associated with the word; passed through as-is
     * @param context sink that receives the converted record
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void map(Text key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        // Locale.ROOT keeps the case mapping independent of the JVM's default locale,
        // so every task produces the same key for the same word (the no-arg
        // toUpperCase() maps 'i' differently under e.g. a Turkish locale).
        String upper = key.toString().toUpperCase(Locale.ROOT);
        context.write(new Text(upper), value);
    }
}
//ChainMapReducer
/**
 * Sums the integer counts received for each key and emits (key, total).
 *
 * <p>Because integer addition is associative and commutative, the same class
 * can serve as both combiner and reducer.
 */
class ChainMapReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * Adds up all {@code values} for {@code key} and writes the total.
     *
     * @param key     the word being counted
     * @param values  the partial counts collected for {@code key}
     * @param context sink that receives the (key, total) record
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // The accumulator must be local: as an instance field it was never reset,
        // so every key's total also included the counts of all keys processed
        // earlier by the same reducer instance.
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
public class FirstClass extends Configured implements Tool{
static Configuration cf;
public int run (String args[])throws IOException,InterruptedException,ClassNotFoundException{
cf=new Configuration();
//bypassing the GenericOptionsParser part and directly running into job declaration part
Job j=Job.getInstance(cf);
/**************CHAIN MAPPER AREA STARTS********************************/
Configuration splitMapConfig=new Configuration(false);
//below we add the 1st mapper class under ChainMapper Class
ChainMapper.addMapper(j, SplitMapper.class, Object.class, Text.class, Text.class, IntWritable.class, splitMapConfig);
//configuration for second mapper
Configuration upperCaseConfig=new Configuration(false);
//below we add the 2nd mapper that is the lower case mapper to the Chain Mapper class
ChainMapper.addMapper(j, UpperCaseMapper.class, Text.class, IntWritable.class, Text.class, IntWritable.class, upperCaseConfig);
/**************CHAIN MAPPER AREA FINISHES********************************/
//now proceeding with the normal delivery
j.setJarByClass(FirstClass.class);
j.setCombinerClass(ChainMapReducer.class);
j.setOutputKeyClass(Text.class);
j.setOutputValueClass(IntWritable.class);
Path p=new Path(args[1]);
//set the input and output URI
FileInputFormat.addInputPath(j, new Path(args[0]));
FileOutputFormat.setOutputPath(j, p);
p.getFileSystem(cf).delete(p, true);
return j.waitForCompletion(true)?0:1;
}
public static void main(String args[])throws Exception{
int res=ToolRunner.run(cf, new FirstClass(), args);
System.exit(res);
}
}
现在,我有一个问题:ChainMapper真的能减轻运行多个MR作业的负担吗?或者有没有人了解ChainMapper的实际使用场景和相关概念?
谢谢:)