Out of memory when Hadoop processes a Wikipedia dump file in bz2

Date: 2017-03-26 02:45:57

Tags: hadoop mapreduce wikipedia

I am trying to run a MapReduce job over a Wikipedia dump file. I have read that Hadoop will decompress the file and split it so it can be processed by the mappers.

However, the job never finishes, and the logs show an Out of Memory error.

I have looked at the project https://github.com/whym/wikihadoop/wiki, which provides an InputFormat called StreamWikiDumpInputFormat, but I cannot use it out of the box because my mappers and reducers are written against Hadoop 2.7.

Can anyone help me?

Edit

My Job class is this one:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import fiberClusterer.hadoop.fs.io.WholeFileInputFormat;
import uniandes.mapRed.WCMapper;
import uniandes.mapRed.WCReducer;

public class WordCounter {

    public static void main(String[] args) {
        if (args.length < 2) {
            System.exit(-1);
        }
        String entrada = args[0];
        String salida = args[1];
        try {
            ejecutarJob(entrada, salida);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void ejecutarJob(String entrada, String salida)
            throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        Job wcJob = Job.getInstance(conf, "WordCounter Job");
        wcJob.setJarByClass(WordCounter.class);

        wcJob.setMapperClass(WCMapper.class);
        wcJob.setMapOutputKeyClass(Text.class);
        wcJob.setMapOutputValueClass(Text.class);

        wcJob.setReducerClass(WCReducer.class);
        wcJob.setOutputKeyClass(Text.class);
        wcJob.setOutputValueClass(Text.class);

        WholeFileInputFormat.setInputPaths(wcJob, new Path(entrada));
        wcJob.setInputFormatClass(WholeFileInputFormat.class);

        TextOutputFormat.setOutputPath(wcJob, new Path(salida));
        wcJob.setOutputFormatClass(TextOutputFormat.class);

        wcJob.waitForCompletion(true);
        System.out.println(wcJob.toString());
    }
}

My mapper is very simple:

import java.io.IOException;
import java.util.Date;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WCMapper extends Mapper<Text, Text, Text, Text> {

    Log log = LogFactory.getLog(WCMapper.class);

    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {

        String[] lines = value.toString().split("\\r?\\n");

        log.info("line");
        for (String line : lines) {
            log.info("line");
            if (line.contains("name")) {
                context.write(new Text((new Date()).toString()), new Text(line));
            }
        }
    }
}

And this is my reducer:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WCReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        for (Text iw : values) {
            context.write(new Text(""), new Text(iw));
        }
    }
}

When I check the logs with YARN, this is the output:

2017-03-26 12:37:07,266 FATAL [main] org.apache.hadoop.mapred.YarnChild: Error running child : java.lang.OutOfMemoryError: Java heap space
    at java.util.Arrays.copyOf(Arrays.java:3332)
    at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:137)
    at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:121)
    at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:421)
    at java.lang.StringBuilder.append(StringBuilder.java:136)
    at fiberClusterer.hadoop.fs.io.MyWholeFileReader.nextKeyValue(MyWholeFileReader.java:104)
    at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.nextKeyValue(MapTask.java:556)
    at org.apache.hadoop.mapreduce.task.MapContextImpl.nextKeyValue(MapContextImpl.java:80)
    at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.nextKeyValue(WrappedMapper.java:91)
    at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
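
The trace shows the OutOfMemoryError being thrown while MyWholeFileReader.nextKeyValue appends the whole decompressed file into a single StringBuilder, so one map task has to hold the entire dump in its heap. As a stopgap, the map task heap can be raised through the job Configuration; the sketch below is only illustrative (class name and values are not from my actual code):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class WordCounterBiggerHeap {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Illustrative values: the map container and its JVM heap must be
        // large enough for whatever the record reader buffers in memory.
        conf.set("mapreduce.map.memory.mb", "4096");
        conf.set("mapreduce.map.java.opts", "-Xmx3686m");
        Job wcJob = Job.getInstance(conf, "WordCounter Job");
        wcJob.setJarByClass(WordCounterBiggerHeap.class);
        // ... the rest of the mapper/reducer/input/output wiring from ejecutarJob above
    }
}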

1 answer:

Answer 0 (score: 0)


I don't know why you have set up all of this WholeFileFormat code in your job. The Hadoop MapReduce framework takes care of compressed files by itself, provided the compression is splittable, as with bz2 and LZO. All you need to make sure of is that the file extension is correct, in this case .bz2 or .bzip2. The JobDriver, SimpleMapper, and SimpleReducer below work with the .bz2 file extension.
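
For reference, Hadoop resolves the codec purely from the file name suffix, and the bzip2 codec is splittable. A minimal sketch that checks this with CompressionCodecFactory (the file name is just a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

public class CodecCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // The codec is resolved from the ".bz2" suffix (the file name is a placeholder).
        CompressionCodec codec = factory.getCodec(new Path("enwiki-pages-articles.xml.bz2"));
        System.out.println("codec: " + codec.getClass().getName());
        // BZip2Codec implements SplittableCompressionCodec, which is what lets
        // the framework split one compressed file across several mappers.
        System.out.println("splittable: " + (codec instanceof SplittableCompressionCodec));
    }
}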


JobDriver

Args: test.bz2 output


package tryout.mapred;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * @author ramesh.b
 */
public class JobDriver extends Configured implements Tool {

    private static final Log log = LogFactory.getLog(JobDriver.class);

    public static void main(String[] args) throws Exception {

        long start = System.currentTimeMillis();
        int res = ToolRunner.run(new Configuration(), new JobDriver(), args);
        long end = System.currentTimeMillis();

        System.out.println("Time spent in millis " + (end - start));
        System.exit(res);
    }


    @SuppressWarnings("deprecation")
    @Override
    public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        try {
            String inputPath = args[0];
            Path outputPath = new Path(args[1]);

            Configuration conf = getConf();

            Job job = new Job(conf);
            job.setJarByClass(JobDriver.class);
            job.setJobName("Simple.0.0");
            job.setReducerClass(SimpleReducer.class);

            job.setOutputFormatClass(TextOutputFormat.class);

            job.setMapperClass(SimpleMapper.class);
            job.setInputFormatClass(TextInputFormat.class);

            FileSystem outfs = outputPath.getFileSystem(conf);

            if (outfs.exists(outputPath)) {
                outfs.delete(outputPath, true);
                log.info("deleted " + outputPath);
            }

            FileInputFormat.addInputPaths(job, inputPath);

            LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
            FileOutputFormat.setOutputPath(job, outputPath);

            return job.waitForCompletion(true) ? 0 : 1;

        } catch (Exception e) {
            e.printStackTrace();
            return 1;
        }
    }
}

SimpleMapper.java

package tryout.mapred;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SimpleMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String line = value.toString().trim();
        if (line.contains("xylophone"))
            context.write(key, value);
    }

}

SimpleReducer

package tryout.mapred;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;


public class SimpleReducer extends Reducer<LongWritable, Text, NullWritable, Text> {

    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text line : values) {
            context.write(NullWritable.get(), line);
        }
    }
}
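
Assuming the three classes above are packaged into a job jar (the jar name below is a placeholder), the job would be submitted with the arguments from the Args line:

hadoop jar simple-job.jar tryout.mapred.JobDriver test.bz2 output

As a side note on the driver, LazyOutputFormat only creates output files for reducers that actually write records, so empty part files are not produced.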