使用MapReduce将文本转换为序列文件时会产生垃圾字符

时间:2015-01-09 11:26:35

标签: java hadoop mapreduce hadoop2 sequencefile

我正在使用MapReduce将文本文件转换为Sequence文件,然后再转换回文本。结果在每行的开头都多出了一些数字。如何删除它们,或者阻止它们进入我的输出?

例如,原始文本:

d001    Marketing

d002    Finance

d003    Human Resources

转换后的序列文件:

0   d001    Marketing

15  d002    Finance\n

28  d003    Human Resources

从序列文件转换回的文本:

0   d001    Marketing

15  d002    Finance

28  d003    Human Resources

我希望删除 0、15、28 这些值。

我正在使用以下代码:

public class FormatConverterTextToSequenceDriver extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {

    if (args.length != 2) {
      System.out.printf("Two parameters are required for FormatConverterTextToSequenceDriver-<input dir> <output dir>\n");
      return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(FormatConverterTextToSequenceDriver.class);
    job.setJobName("Create Sequence File, from text file");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(FormatConverterMapper.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setNumReduceTasks(0);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
  }
 -----------------------------------------------------------------
public class FormatConverterSequenceToTextDriver extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {

    if (args.length != 2) {
      System.out
          .printf("Two parameters need to be supplied - <input dir> and <output dir>\n");
      return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(FormatConverterSequenceToTextDriver.class);
    job.setJobName("Convert Sequence File and Output as Text");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(FormatConverterMapper.class);
    job.setNumReduceTasks(0);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
  }
 -----------------------------------------------------------------
/**
 * Pass-through mapper: each input record is emitted unchanged, key included.
 */
public class FormatConverterMapper
    extends Mapper<LongWritable, Text, LongWritable, Text> {

  /**
   * Forwards the incoming key/value pair to the output as-is.
   *
   * @param key the key of the input record
   * @param value the value of the input record
   * @param context the task context that receives the output record
   * @throws IOException if writing the record fails
   * @throws InterruptedException if the write is interrupted
   */
  @Override
  public void map(final LongWritable key, final Text value, final Context context)
      throws IOException, InterruptedException {
    // No transformation: the record leaves with the same key and value it arrived with.
    context.write(key, value);
  }
}

感谢任何帮助。

2 个答案:

答案 0 :(得分:0)

当您从序列文件转换回文本时,不应再输出之前写入的 LongWritable 键(即每行的字节偏移量)。所以只需将 map 方法中的写入调用调整为:

 /**
  * Emits only the line text, dropping the key that was read from the sequence file.
  *
  * <p>The key slot is written as {@code null} rather than passing {@code value} as the
  * key: with an output key type of {@code LongWritable}, a {@code Text} key would not
  * compile. Hadoop's {@code TextOutputFormat} skips a null key and its separator, so
  * each output line contains just the value.
  *
  * <p>NOTE(review): assumes the enclosing class is still declared as
  * {@code Mapper<LongWritable, Text, LongWritable, Text>} as in the question, and that
  * this mapper is used only by the SequenceFile-to-text job (a null key cannot be
  * appended to a SequenceFile) - confirm.
  *
  * @param key the key stored in the sequence file; intentionally ignored
  * @param value the line text to emit
  * @param context the task context that receives the output record
  * @throws IOException if writing the record fails
  * @throws InterruptedException if the write is interrupted
  */
 @Override
 public void map(LongWritable key, Text value, Context context)
     throws IOException, InterruptedException {
   context.write(null, value);
 }

输出应该只是值本身。

答案 1 :(得分:0)

下面给出可运行的代码以供参考。

由4个类组成

SequenceFileGenDriver - 从文本文件创建序列文件的驱动程序代码,包含2个参数,源文本文件路径和目标序列文件路径

SequenceFileGenMapper - 将文本文件转换为序列文件的映射器。

TextFileGenDriver - 将序列文件转换为文本文件的驱动程序代码,包含2个参数,输入序列文件路径和输出文本文件路径

TextFileGenMapper - 将序列文件转换为文本文件的映射器

    /**
     * Driver for a map-only job that converts text files into a SequenceFile whose keys
     * are the input lines ({@code Text}) and whose values are {@code NullWritable}.
     */
    public class SequenceFileGenDriver {

        /**
         * Configures and runs the job, then exits with a status reflecting the outcome.
         *
         * @param args exactly two entries: the source text path and the target
         *             sequence-file path
         * @throws Exception if configuring or running the job fails
         */
        public static void main(String[] args) throws Exception {
            // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
            if (args.length != 2) {
                System.err.println(
                        "Usage: SequenceFileGenDriver <input text path> <output sequence file path>");
                System.exit(2);
            }

            Configuration conf = new Configuration();
            // Job.getInstance is the supported factory; the Job(Configuration) constructor is deprecated.
            Job job = Job.getInstance(conf);
            job.setJarByClass(SequenceFileGenDriver.class);
            job.setMapperClass(SequenceFileGenMapper.class);
            // Map-only: mapper output goes directly to the output format.
            job.setNumReduceTasks(0);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            TextInputFormat.addInputPath(job, new Path(args[0]));
            SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

            // Propagate job failure to the caller through the process exit status.
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }

    }

    public class SequenceFileGenMapper extends
                    Mapper<LongWritable, Text, Text, NullWritable> {
            private final static NullWritable nullWritable = NullWritable.get();
            public void map(LongWritable key, Text value, Context context)
                            throws IOException, InterruptedException {
                            context.write(value, nullWritable);
            }
    }


    /**
     * Driver for a map-only job that converts a SequenceFile with {@code Text} keys and
     * {@code NullWritable} values back into plain text files.
     */
    public class TextFileGenDriver {

        /**
         * Configures and runs the job, then exits with a status reflecting the outcome.
         *
         * @param args exactly two entries: the input sequence-file path and the output
         *             text path
         * @throws Exception if configuring or running the job fails
         */
        public static void main(String[] args) throws Exception {
            // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
            if (args.length != 2) {
                System.err.println(
                        "Usage: TextFileGenDriver <input sequence file path> <output text path>");
                System.exit(2);
            }

            Configuration conf = new Configuration();
            // Job.getInstance is the supported factory; the Job(Configuration) constructor is deprecated.
            Job job = Job.getInstance(conf);
            job.setJarByClass(TextFileGenDriver.class);
            job.setMapperClass(TextFileGenMapper.class);
            job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            // Map-only: mapper output goes directly to the output format.
            job.setNumReduceTasks(0);
            SequenceFileInputFormat.addInputPath(job, new Path(args[0]));
            TextOutputFormat.setOutputPath(job, new Path(args[1]));

            // Propagate job failure to the caller through the process exit status.
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }

    }

public class TextFileGenMapper extends
             Mapper<Text, NullWritable, Text, NullWritable> {
        private final static NullWritable nullWritable = NullWritable.get();
            public void map(Text key, NullWritable value, Context context)
                           throws IOException, InterruptedException {
                       context.write(key, nullWritable);
        }
}