Cleaning data with MapReduce, but the job runs beyond physical memory limits

Time: 2016-12-30 02:14:56

Tags: hadoop mapreduce

I am trying to run a data cleaning program on my cluster with Hadoop MapReduce, but it ends with:
Container [pid=5932,containerID=container_1480660624412_0297_02_000001] is running beyond physical memory limits. Current usage: 2.0 GB of 2 GB physical memory used; 31.8 GB of 4.2 GB virtual memory used. Killing container.

I have already changed the configuration by setting:
    job.getConfiguration().setInt("mapreduce.map.memory.mb",3072);
    job.getConfiguration().setInt("mapreduce.reduce.memory.mb",6144);
    job.getConfiguration().setStrings("mapreduce.map.java.opts","-Xmx3072m");
    job.getConfiguration().setStrings("mapreduce.reduce.java.opts","-Xmx6144m");

But it does not seem to work. The error is always "Current usage: 2.0 GB of 2 GB physical memory used; 31.8 GB of 4.2 GB virtual memory used".

I would like to know how to solve this. Maybe someone can also suggest a basic data-processing pattern for this kind of job, because I think my current approach is not flexible. Thank you very much.

Data size: 6 TB. The cluster has 10 servers.

When the data size is small, e.g. 30 GB, the program runs fine.
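For reference, `mapreduce.map.memory.mb` / `mapreduce.reduce.memory.mb` size the YARN container, while the `-Xmx` value in `mapreduce.{map,reduce}.java.opts` sizes the JVM heap running inside that container; the heap is normally kept noticeably below the container size so that non-heap memory does not push the process over the physical limit. Below is a minimal sketch of that relationship (the class name, helper method, and the 80% ratio are illustrative assumptions, not taken from this post):

```
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class MemorySettingsSketch {

    // Hypothetical helper: container sizes are passed in, heap sizes derived from them.
    public static Job newJob(Configuration conf, int mapMb, int reduceMb) throws Exception {
        // Container sizes requested from YARN; they must stay within
        // yarn.scheduler.maximum-allocation-mb as configured on the cluster.
        conf.setInt("mapreduce.map.memory.mb", mapMb);
        conf.setInt("mapreduce.reduce.memory.mb", reduceMb);

        // JVM heap inside each container: keep it below the container size
        // (about 80% here) to leave room for non-heap/native memory.
        conf.set("mapreduce.map.java.opts", "-Xmx" + (mapMb * 8 / 10) + "m");
        conf.set("mapreduce.reduce.java.opts", "-Xmx" + (reduceMb * 8 / 10) + "m");

        // The values must be in the Configuration before the job is submitted.
        return Job.getInstance(conf);
    }
}
```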

In mapred-site.xml:

```
<property>
  <name>mapreduce.map.memory.mb</name>
  <value>3072</value>
</property>
<property>
  <name>mapreduce.reduce.memory.mb</name>
  <value>3072</value>
</property>
<property>
  <name>mapreduce.map.java.opts</name>
  <value>-Xmx3072m</value>
</property>
<property>
  <name>mapreduce.reduce.java.opts</name>
  <value>-Xmx6144m</value>
</property>
```

My code:

```

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.yarn.conf.YarnConfiguration;


public class DataCleanIdIconWeb1{
    public static class QLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        Text outputValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            // the map just processes each line of the string and outputs it
        }
    }
    public static class QLCombiner extends Reducer<Text, NullWritable, Text, NullWritable> {
            @Override
            protected void reduce(Text key, Iterable<NullWritable> values,
                    Reducer<Text, NullWritable, Text, NullWritable>.Context context) throws IOException, InterruptedException {

    //              String line = key.toString();
    //              String[] split = line.split("\t");
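            // Writing each key exactly once here deduplicates identical lines locally
            // before they are shuffled to the reducers.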


            context.write(key, NullWritable.get());

            }
    }
    public static class QLReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

        private MultipleOutputs<Text, NullWritable> mos;
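        // MultipleOutputs lets a single reducer write to several named outputs; the extra
        // baseOutputPath argument in mos.write() (e.g. "iconRecord/icon") places each record
        // type in its own subdirectory under the job output directory.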

        @Override
        protected void setup(Reducer<Text, NullWritable, Text, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // create the MultipleOutputs helper once per reduce task
            super.setup(context);
            mos = new MultipleOutputs<Text, NullWritable>(context);
        }

        @Override
        protected void cleanup(Reducer<Text, NullWritable, Text, NullWritable>.Context context)
                throws IOException, InterruptedException {
            // close MultipleOutputs so the named output files are flushed and committed
            super.cleanup(context);
            mos.close();
        }

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {

            String line = key.toString();
            String[] split = line.split("\t");


            if (split.length == 5) {
                // mos.write("iconRecord", key, NullWritable.get());
                mos.write("iconRecord", key, NullWritable.get(), "iconRecord/icon");

            } else if (split.length == 1) {
                // mos.write("allID", key, NullWritable.get());
                mos.write("AllID", key, NullWritable.get(), "AllID/AllID");
            } else { // split.length == 6
                // mos.write("webRecord", key, NullWritable.get());
                mos.write("webRecord", key, NullWritable.get(), "webRecord/web");
            }

        }
    }

    public static void run(String originalDataPath, String dataCleanOutputFile) throws Exception {
        System.out.println("Start dataClean");
        //long startTime = System.currentTimeMillis();

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);


        job.setJarByClass(DataCleanIdIconWeb1.class);
        job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
        job.getConfiguration().setClass("mapreduce.map.output.compress.codec", GzipCodec.class, CompressionCodec.class);

        job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", false);
        job.getConfiguration().setStrings("mapreduce.reduce.shuffle.input.buffer.percent", "0.01");
        job.getConfiguration().setInt("yarn.scheduler.maximum-allocation-mb",6144);


        job.getConfiguration().setStrings("mapred.child.java.opts","-Xmx8192m");

        job.getConfiguration().setInt("mapreduce.map.memory.mb",3072);
        job.getConfiguration().setInt("mapreduce.reduce.memory.mb",6144);
        job.getConfiguration().setStrings("mapreduce.map.java.opts","-Xmx3072m");
        job.getConfiguration().setStrings("mapreduce.reduce.java.opts","-Xmx6144m");
        job.getConfiguration().setBoolean("yarn.nodemanager.vmem-check-enabled",false);
        job.setNumReduceTasks(30);
        //job.getConfiguration().setInt("yarn.nodemanager.vmem-pmem-ratio",5);
        job.getConfiguration().setStrings("mapreduce.job.jvm.numtasks","-1");


        job.setMapperClass(QLMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // FileInputFormat.setInputPaths(job, new
        // Path("hdfs://10.1.18.202:9000/data/user_detail_201606"));
        // FileInputFormat.setInputPaths(job, new
        // Path("hdfs://10.1.18.202:9000/data/userAllDetail3M.txt"));
        FileInputFormat.setInputPaths(job, new Path(originalDataPath));

        // FileInputFormat.class, QLMapper.class);
        // MultipleInputs.addInputPath(job, new Path(args[1]),
        // FileInputFormat.class, QLMapper.class);
        //

        job.setCombinerClass(QLCombiner.class);
        job.setReducerClass(QLReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);


        job.setInputFormatClass(TextInputFormat.class);

        //job.setOutputFormatClass(NullOutputFormat.class);

        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
        //job.setOutputFormatClass(TextOutputFormat.class);
        // FileOutputFormat.setOutputPath(job, new
        // Path("hdfs://10.1.18.202:9000/mrOutput/dataclean3mNew2"));
        FileOutputFormat.setOutputPath(job, new Path(dataCleanOutputFile));

        MultipleOutputs.addNamedOutput(job, "iconRecord", TextOutputFormat.class, Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, "AllID", TextOutputFormat.class, Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, "webRecord", TextOutputFormat.class, Text.class, NullWritable.class);


        job.waitForCompletion(true);

        long endTime = System.currentTimeMillis();
        //System.out.println("DataClean Time: " + (endTime - startTime) / 1000f / 60f / 60f + " h");


    }

    public static void main(String[] args) throws Exception {
    //  String originalDataPath = "hdfs://10.1.18.202:9000/recommend/100data";
    //  String dataCleanOutputFile = "hdfs://10.1.18.202:9000/recommend/gameRecommend11.16/dataClean11.29";
        String originalDataPath = "hdfs://pre/user/hdu/data/newRecord/originalData7";
        String dataCleanOutputFile = "hdfs://pre//user/hdu/gamerecommend/dataClean/2016-12-16/step1";
        DataCleanIdIconWeb1.run(originalDataPath, dataCleanOutputFile);
    }

}
```

0 Answers:

No answers