Hadoop MapReduce: generating substrings of different lengths

Asked: 2016-07-21 10:47:38

Tags: java hadoop mapreduce shingles

Using Hadoop MapReduce, I am writing code to get substrings of different lengths. For example, given the string "ZYXCBA" and a length of 3 (via a text file, I pass the input as "3 ZYXCBA"), my code has to return all possible substrings of length 3 ("ZYX", "YXC", "XCB", "CBA"), of length 4 ("ZYXC", "YXCB", "XCBA"), and finally of length 5 ("ZYXCB", "YXCBA").

In the map phase I did the following:

key = the length of the substrings I want

value = "ZYXCBA"

So the mapper output is:

3,"ZYXCBA"
4,"ZYXCBA"
5,"ZYXCBA"

In reduce, I use the string ("ZYXCBA") and the key 3 to get all substrings of length 3, and likewise for 4 and 5. The results are concatenated into a single string. So the reduce output should be:

3 "ZYX YXC XCB CBA"
4 "ZYXC YXCB XCBA"
5 "ZYXCB YXCBA" 

I am running my code with the following command:

hduser@Ganesh:~/Documents$ hadoop jar Saishingles.jar hadoopshingles.Saishingles Behara/Shingles/input Behara/Shingles/output

My code is shown below:

package hadoopshingles;

import java.io.IOException;
//import java.util.ArrayList;

import org.apache.hadoop.fs.Path; 
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;


public class Saishingles{

public static class shinglesmapper extends Mapper<Object, Text, IntWritable, Text>{

        public void map(Object key, Text value, Context context
                ) throws IOException, InterruptedException {

            String str = new String(value.toString());
            String[] list = str.split(" ");
            int x = Integer.parseInt(list[0]);
            String val = list[1];
            int M = val.length();
            int X = M-1;


            for(int z = x; z <= X; z++)
            {
                context.write(new IntWritable(z), new Text(val));
            }

        }

     }


public static class shinglesreducer extends Reducer<IntWritable,Text,IntWritable,Text> {


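    // NOTE: this method takes a single Text value rather than Iterable<Text>,
    // so it does not override Reducer.reduce(); see the answer below.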
    public void reduce(IntWritable key, Text value, Context context
            ) throws IOException, InterruptedException {
        int z = key.get();
        String str = new String(value.toString());
        int M = str.length();
        int Tz = M - z;
        String newvalue = "";
        for(int position = 0; position <= Tz; position++)
        {
            newvalue = newvalue + " " + str.substring(position,position + z);   
        }

        context.write(new IntWritable(z),new Text(newvalue));
    }
}




public static void main(String[] args) throws Exception {
      GenericOptionsParser parser = new GenericOptionsParser(args);
      Configuration conf = parser.getConfiguration();
      String[] otherArgs = parser.getRemainingArgs();

        if (otherArgs.length != 2) 
        {
          System.err.println("Usage: Saishingles <inputFile> <outputDir>");
          System.exit(2);
        }
      Job job = Job.getInstance(conf, "Saishingles");
      job.setJarByClass(hadoopshingles.Saishingles.class);
      job.setMapperClass(shinglesmapper.class);
      //job.setCombinerClass(shinglesreducer.class);
      job.setReducerClass(shinglesreducer.class);
      //job.setMapOutputKeyClass(IntWritable.class);
      //job.setMapOutputValueClass(Text.class);
      job.setOutputKeyClass(IntWritable.class);
      job.setOutputValueClass(Text.class);
      FileInputFormat.addInputPath(job, new Path(args[0]));
      FileOutputFormat.setOutputPath(job, new Path(args[1]));
      System.exit(job.waitForCompletion(true) ? 0 : 1);

}

}

Instead of the reduce returning:

3 "ZYX YXC XCB CBA"
4 "ZYXC YXCB XCBA"
5 "ZYXCB YXCBA" 

it returns:

3 "ZYXCBA"
4 "ZYXCBA"
5 "ZYXCBA"

That is, it gives the same output as the mapper. I don't know why this is happening. Please help me fix it; thanks in advance ;) :) :).

1 Answer:

Answer 0 (score: 0):

You don't even need to run a reducer to achieve this; your map/reduce logic is wrong. The transformation should be done in the Mapper.
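
For example, a map-only variant could build the joined string per length directly in the mapper and skip the reduce phase entirely. This is only a sketch of that idea (class name and details are mine, not the posted code below):

public static class MapOnlyShingles extends Mapper<Object, Text, IntWritable, Text> {

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] parts = value.toString().split(" ");
        int minLen = Integer.parseInt(parts[0].trim());
        String str = parts[1].replaceAll("\"", "").trim();

        // For each requested length, join all substrings of that length.
        for (int len = minLen; len < str.length(); len++) {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i + len <= str.length(); i++) {
                if (sb.length() > 0) sb.append(' ');
                sb.append(str, i, i + len);
            }
            context.write(new IntWritable(len), new Text(sb.toString()));
        }
    }
}

With job.setNumReduceTasks(0) in main(), the map output is written out directly and no reducer runs at all.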

From the Hadoop documentation: "Reduce - In this phase the reduce(WritableComparable, Iterator, OutputCollector, Reporter) method is called for each <key, (list of values)> pair in the grouped inputs."

In your code, the reduce signature is:

public void reduce(IntWritable key, Text value, Context context)

It should be:

public void reduce(IntWritable key, Iterable<Text> values, Context context)

Because the parameter list does not match, your method never overrides Reducer.reduce(); Hadoop silently uses the default identity implementation instead, which writes each incoming pair straight to the output. That is why you see the mapper's output unchanged.
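
A way to catch this kind of mismatch at compile time (my suggestion, not part of the original answer) is to annotate the method with @Override; the compiler then rejects any method that does not actually override Reducer.reduce():

public static class shinglesreducer extends Reducer<IntWritable, Text, IntWritable, Text> {

    @Override  // compile error here if the signature does not match Reducer.reduce()
    public void reduce(IntWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // ... build the space-separated substrings as before ...
    }
}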

Also, change the last line of the reduce method from context.write(new IntWritable(z), new Text(newvalue)); to context.write(key, new Text(newvalue)); - you already have an IntWritable key from the mapper, so there is no need to create a new one.

Given the input:

3 "ZYXCBA"
4 "ZYXCBA"
5 "ZYXCBA"

the job will output:

3   "XCB YXC ZYX"
4   "XCBA YXCB ZYXC"
5   "YXCBA ZYXCB"

MapReduce job:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SubStrings{

    public static class SubStringsMapper extends Mapper<Object, Text, IntWritable, Text> {

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {

            String [] values = value.toString().split(" ");
            int len = Integer.parseInt(values[0].trim());
            String str = values[1].replaceAll("\"", "").trim();

            // Slide a window of size len across the string; the bound
            // i + len <= str.length() keeps the final substring (e.g. "CBA"
            // for length 3) in the output.
            for (int i = 0; i + len <= str.length(); i++) {
                context.write(new IntWritable(len), new Text(str.substring(i, i + len)));
            }

        }   
    }

    public  static class SubStringsReducer extends Reducer<IntWritable, Text, IntWritable, Text> {

        @Override
        public void reduce(IntWritable key, Iterable<Text> values, Context context) 
                throws IOException, InterruptedException {

            String str="\""; //adding starting quotes
            for(Text value: values)
                str += " " + value;

            str=str.replace("\" ", "\"") + "\""; //adding ending quotes
            context.write(key, new Text(str));
        }
    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "get-possible-strings-by-length");

        job.setJarByClass(SubStrings.class);
        job.setMapperClass(SubStringsMapper.class); 
        job.setReducerClass(SubStringsReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        FileSystem fs = null;
        Path dstFilePath = new Path(args[1]);
        try {
            fs = dstFilePath.getFileSystem(conf);
            if (fs.exists(dstFilePath))
                fs.delete(dstFilePath, true);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        job.waitForCompletion(true);
    } 
}
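
Assuming the class is packaged the same way as before (jar name illustrative), the job runs as:

hadoop jar SubStrings.jar SubStrings <inputDir> <outputDir>

Note one design choice in this main(): the output directory is deleted up front if it already exists, so re-running the job does not fail with the usual "output directory already exists" error.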