I am a beginner at MapReduce programming and wrote the following Java program to run on a Hadoop cluster with 1 NameNode and 3 DataNodes:
package trial;

import java.io.IOException;
import java.util.*;
import java.lang.Iterable;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class Trial
{
    public static class MapA extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>
    {
        public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            String[] rows = value.toString().split("\r?\n");
            for (int i = 0; i < rows.length; i++)
            {
                String[] cols = rows[i].toString().split(",");
                String v = cols[0];
                for (int j = 1; j < cols.length; j++)
                {
                    String k = j + "," + cols[j];
                    output.collect(new Text(k), new Text(v));
                }
            }
        }
    }

    public static class ReduceA extends MapReduceBase implements Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            int count = 0;
            String[] attr = key.toString().split(",");
            List<String> list = new ArrayList<String>();
            while (values.hasNext())
            {
                list.add((values.next()).toString());
                count++;
            }
            String v = Integer.toString(count);
            for (String s : list)
            {
                output.collect(new Text(s), new Text(v));
            }
        }
    }

    public static void main(String[] args) throws IOException
    {
        JobConf conf1 = new JobConf(Trial.class);
        conf1.setJobName("Trial");
        conf1.setOutputKeyClass(Text.class);
        conf1.setOutputValueClass(Text.class);
        conf1.setMapperClass(MapA.class);
        //conf.setCombinerClass(Combine.class);
        conf1.setReducerClass(ReduceA.class);
        conf1.setInputFormat(TextInputFormat.class);
        conf1.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf1, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf1, new Path(args[1]));
        JobClient.runJob(conf1);

        JobConf conf2 = new JobConf(Final.class);
        conf2.setJobName("Final");
        conf2.setOutputKeyClass(Text.class);
        conf2.setOutputValueClass(Text.class);
        conf2.setMapperClass(Final.MapB.class);
        //conf.setCombinerClass(Combine.class);
        conf2.setReducerClass(Final.ReduceB.class);
        conf2.setInputFormat(TextInputFormat.class);
        conf2.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf2, new Path(args[1]));
        FileOutputFormat.setOutputPath(conf2, new Path(args[2]));
        JobClient.runJob(conf2);
    }
}

class Final
{
    public static class MapB extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>
    {
        public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            String[] r = value.toString().split("\r?\n");
            String[] p1 = new String[5];
            for (int i = 0; i < r.length; i++)
            {
                p1 = r[i].split("\t");
                output.collect(new Text(p1[0]), new Text(p1[1]));
            }
        }
    }

    public static class ReduceB extends MapReduceBase implements Reducer<Text, Text, Text, Text>
    {
        @Override
        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            int sum = 0;
            while (values.hasNext())
            {
                String s = (values.next()).toString();
                int c = Integer.parseInt(s);
                sum += c;
            }
            float avf = (float) sum / 3;
            String count = Float.toString(avf);
            output.collect(key, new Text(count));
        }
    }
}
The program is run on a dataset like the following:
ID1,1,2,3
ID1,1,3,2
ID3,2,2,3
Each row has an ID followed by 3 comma-separated attributes. My problem is to find, for each ID, the frequency of each of its attribute values, where the frequency is counted within that attribute's column (not across the row) if the dataset is viewed as a 2-D array; then to sum those frequencies over the ID's attributes and find the average. So for the dataset above:
ID1 : (2+2+2)/3 = 2
ID2 : (2+1+1)/3 = 1.33
ID3 : (1+2+2)/3 = 1.67
The above code works fine on small datasets of around 200-500 MB. But for datasets of 1 GB and above, I get errors like the following:
map 100% reduce 50%
14/04/12 12:33:06 INFO mapred.JobClient: Task Id : attempt_201404121146_0002_r_000001_0, Status : FAILED
Error: Java heap space
attempt_201404121146_0002_r_000001_0: Exception in thread "LeaseRenewer:hdfs@NameNode:8020" java.lang.OutOfMemoryError: Java heap space
attempt_201404121146_0002_r_000001_0: at org.apache.hadoop.hdfs.LeaseRenewer.renew(LeaseRenewer.java:397)
attempt_201404121146_0002_r_000001_0: at org.apache.hadoop.hdfs.LeaseRenewer.run(LeaseRenewer.java:436)
attempt_201404121146_0002_r_000001_0: at org.apache.hadoop.hdfs.LeaseRenewer.access$700(LeaseRenewer.java:70)
attempt_201404121146_0002_r_000001_0: at org.apache.hadoop.hdfs.LeaseRenewer$1.run(LeaseRenewer.java:297)
attempt_201404121146_0002_r_000001_0: at java.lang.Thread.run(Thread.java:662)
attempt_201404121146_0002_r_000001_0: Exception in thread "Thread for syncLogs" java.lang.OutOfMemoryError: Java heap space
attempt_201404121146_0002_r_000001_0: at java.util.AbstractList.iterator(AbstractList.java:273)
attempt_201404121146_0002_r_000001_0: at org.apache.hadoop.mapred.TaskLog.syncLogs(TaskLog.java:363)
attempt_201404121146_0002_r_000001_0: at org.apache.hadoop.mapred.Child$3.run(Child.java:158)
14/04/12 12:33:10 INFO mapred.JobClient: map 100% reduce 33%
14/04/12 12:33:12 INFO mapred.JobClient: Task Id : attempt_201404121146_0002_r_000003_0, Status : FAILED
Error: Java heap space
attempt_201404121146_0002_r_000003_0: log4j:WARN No appenders could be found for logger (org.apache.hadoop.mapred.Task).
attempt_201404121146_0002_r_000003_0: log4j:WARN Please initialize the log4j system properly.
attempt_201404121146_0002_r_000003_0: log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
14/04/12 12:33:15 INFO mapred.JobClient: map 100% reduce 16%
14/04/12 12:33:16 INFO mapred.JobClient: map 100% reduce 18%
14/04/12 12:33:16 INFO mapred.JobClient: Task Id : attempt_201404121146_0002_r_000000_0, Status : FAILED
Error: Java heap space
attempt_201404121146_0002_r_000000_0: Exception in thread "LeaseRenewer:hdfs@NameNode:8020" java.lang.OutOfMemoryError: Java heap space
attempt_201404121146_0002_r_000000_0: at java.lang.StringCoding.set(StringCoding.java:53)
attempt_201404121146_0002_r_000000_0: at java.lang.StringCoding.decode(StringCoding.java:171)
attempt_201404121146_0002_r_000000_0: at java.lang.String.<init>(String.java:443)
attempt_201404121146_0002_r_000000_0: at java.util.jar.Attributes.read(Attributes.java:401)
attempt_201404121146_0002_r_000000_0: at java.util.jar.Manifest.read(Manifest.java:182)
attempt_201404121146_0002_r_000000_0: at java.util.jar.Manifest.<init>(Manifest.java:52)
attempt_201404121146_0002_r_000000_0: at java.util.jar.JarFile.getManifestFromReference(JarFile.java:167)
attempt_201404121146_0002_r_000000_0: at java.util.jar.JarFile.getManifest(JarFile.java:148)
attempt_201404121146_0002_r_000000_0: at sun.misc.URLClassPath$JarLoader$2.getManifest(URLClassPath.java:696)
attempt_201404121146_0002_r_000000_0: at java.net.URLClassLoader.defineClass(URLClassLoader.java:228)
attempt_201404121146_0002_r_000000_0: at java.net.URLClassLoader.access$000(URLClassLoader.java:58)
attempt_201404121146_0002_r_000000_0: at java.net.URLClassLoader$1.run(URLClassLoader.java:197)
attempt_201404121146_0002_r_000000_0: at java.security.AccessController.doPrivileged(Native Method)
attempt_201404121146_0002_r_000000_0: at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
attempt_201404121146_0002_r_000000_0: at java.lang.ClassLoader.loadClass(ClassLoader.java:306)
attempt_201404121146_0002_r_000000_0: at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
attempt_201404121146_0002_r_000000_0: at java.lang.ClassLoader.loadClass(ClassLoader.java:247)
attempt_201404121146_0002_r_000000_0: at org.apache.hadoop.hdfs.LeaseRenewer.renew(LeaseRenewer.java:400)
attempt_201404121146_0002_r_000000_0: at org.apache.hadoop.hdfs.LeaseRenewer.run(LeaseRenewer.java:436)
attempt_201404121146_0002_r_000000_0: at org.apache.hadoop.hdfs.LeaseRenewer.access$700(LeaseRenewer.java:70)
attempt_201404121146_0002_r_000000_0: at org.apache.hadoop.hdfs.LeaseRenewer$1.run(LeaseRenewer.java:297)
attempt_201404121146_0002_r_000000_0: at java.lang.Thread.run(Thread.java:662)
14/04/12 12:33:21 INFO mapred.JobClient: map 100% reduce 20%
I think my program is using too much memory and needs to be optimized. I even tried increasing the Java heap space to 1024 MB, but I still get the same error. The dataset I am using is 1.4 GB, with 5 crore (50 million) rows of 9 attributes each, excluding the row ID. Since my problem is with big data, testing the code on small data is not a solution. Please suggest how I can optimize my code so that the memory issue is resolved. Thanks in advance.
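(For context, with this MRv1 setup the per-task heap is normally controlled by mapred.child.java.opts; a minimal sketch of raising it to the 1024 MB mentioned above, assuming it is set per job rather than cluster-wide in mapred-site.xml:)

JobConf conf = new JobConf(Trial.class);
// Each map/reduce child JVM gets a 1 GB heap; setting mapred.child.java.opts in mapred-site.xml has the same effect cluster-wide
conf.set("mapred.child.java.opts", "-Xmx1024m");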
Answer 0 (score: 1)
Since there is no way to iterate over the values iterator more than once, and your heap cannot handle the huge number of values stored in a list, I suggest adding an intermediate MapReduce step, making a total of three MapReduce steps for your job.

My proposal is as follows:

Step 1

Mapper 1 outputs attributeID + "," + value => UserID.

Reducer 1 computes the total count for each key (attributeID + "," + value). First, it re-emits every attributeID + "," + value => UserID pair it received from Mapper 1. Second, it outputs "." + attributeID + "," + value => total_count. The dot is added as a prefix to make sure that all the total_counts reach the next Reducer first; this is guaranteed by the sort phase.
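To make the flow concrete, for the 3-row sample dataset given in the question, Step 1 would produce pairs along these lines (shown as key => value; this is only an illustration of the scheme, not actual job output):

Mapper 1 (one line per input row):
1,1 => ID1    2,2 => ID1    3,3 => ID1
1,1 => ID1    2,3 => ID1    3,2 => ID1
1,2 => ID3    2,2 => ID3    3,3 => ID3

Reducer 1, e.g. for key 1,1 (which received two ID1 values):
1,1 => ID1
1,1 => ID1
.1,1 => 2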
Step 2

Mapper 2 does nothing except emit every input it receives.

Reducer 2 is guaranteed to receive all the total_counts first. So, as long as a line corresponds to a total_count, it stores it in a HashMap (attributeID + "," + value => total_count). Once it starts receiving the other lines, all it has to do is look up the corresponding total_count in the HashMap and output UserID => total_count.

Note that only one Reducer should be used for this phase, so you have to set mapreduce.job.reduces to 1. You can reset it to its previous value after this step.
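With the old mapred API used throughout this question, the per-job way to force a single reducer is setNumReduceTasks; a minimal sketch (conf2 here is assumed to be the JobConf of the Step 2 job):

JobConf conf2 = new JobConf(Trial.class);
conf2.setJobName("Step2");      // hypothetical name for the intermediate job
conf2.setNumReduceTasks(1);     // single reducer, so Reducer 2 sees all total_counts before the other records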
Step 3

Same as the second MapReduce step of your initial solution: compute the average and output UserID => average.

This solution is rather optimistic, since it assumes that your heap can hold the HashMap. Give it a try and see what happens.

Here is the sample code:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;

public class Trial {

    // Step 1 mapper: emits attributeID + "," + value => UserID
    public static class MapA extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>
    {
        public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            String[] rows = value.toString().split("\r?\n");
            for (int i = 0; i < rows.length; i++) {
                String[] cols = rows[i].toString().split(",");
                String v = cols[0];
                for (int j = 1; j < cols.length; j++) {
                    String k = j + "," + cols[j];
                    output.collect(new Text(k), new Text(v));
                }
            }
        }
    }

    // Step 1 reducer: re-emits every pair and additionally emits "." + key => total_count
    public static class ReduceA extends MapReduceBase implements Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            int count = 0;
            while (values.hasNext()) {
                output.collect(key, values.next());
                count++;
            }
            output.collect(new Text("." + key),
                    new Text(Integer.toString(count)));
        }
    }

    // Step 2 mapper: identity (assumes KeyValueTextInputFormat, so the key is the text before the first tab)
    public static class MapB extends MapReduceBase implements Mapper<Text, Text, Text, Text>
    {
        public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            output.collect(key, value);
        }
    }

    // Step 2 reducer: must run as a single reducer; caches the total_counts, then joins them to each UserID
    public static class ReduceB extends MapReduceBase implements Reducer<Text, Text, Text, Text>
    {
        private Map<String, Integer> total_count = new HashMap<String, Integer>();
        private Set<String> attributes = new HashSet<String>(); // count the distinct number of attributes

        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            String rKey = key.toString();
            if (rKey.startsWith(".")) {
                while (values.hasNext()) {
                    total_count.put(rKey.substring(1), Integer.valueOf(values.next().toString()));
                    attributes.add(rKey.substring(1).split(",")[0]);
                    return;
                }
            }
            while (values.hasNext()) {
                Text value = values.next();
                output.collect(value, new Text(Integer.toString(total_count.get(rKey))));
                output.collect(value, new Text("." + attributes.size())); // send the total number of attributes
            }
        }
    }

    // Step 3 mapper: identity (again assumes KeyValueTextInputFormat)
    public static class MapC extends MapReduceBase implements Mapper<Text, Text, Text, Text>
    {
        public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            output.collect(key, value);
        }
    }

    // Step 3 reducer: sums the counts per UserID and divides by the number of attributes
    public static class ReduceC extends MapReduceBase implements Reducer<Text, Text, Text, DoubleWritable>
    {
        @Override
        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, DoubleWritable> output, Reporter reporter) throws IOException
        {
            long sum = 0;
            int nbAttributes = 0;
            while (values.hasNext()) {
                String value = values.next().toString();
                if (value.startsWith(".")) { // check if line corresponds to the total number of attributes
                    nbAttributes = Integer.parseInt(value.substring(1));
                } else {
                    sum += Integer.parseInt(value);
                }
            }
            output.collect(key, new DoubleWritable((double) sum / nbAttributes));
        }
    }
}
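The snippet above only defines the mappers and reducers; you still need a driver to chain the three jobs. Below is a minimal, untested sketch using the same old mapred API. The job names, the extra output-path argument, and the use of KeyValueTextInputFormat for steps 2 and 3 (so that MapB and MapC receive Text keys) are my assumptions, since no driver is shown above; it also needs import org.apache.hadoop.fs.Path;.

public static void main(String[] args) throws IOException {
    // Step 1: emit attributeID,value => UserID plus ".attributeID,value" => total_count
    JobConf conf1 = new JobConf(Trial.class);
    conf1.setJobName("Step1");
    conf1.setOutputKeyClass(Text.class);
    conf1.setOutputValueClass(Text.class);
    conf1.setMapperClass(MapA.class);
    conf1.setReducerClass(ReduceA.class);
    conf1.setInputFormat(TextInputFormat.class);
    conf1.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf1, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf1, new Path(args[1]));
    JobClient.runJob(conf1);

    // Step 2: join each UserID with its total_count; must run with a single reducer
    JobConf conf2 = new JobConf(Trial.class);
    conf2.setJobName("Step2");
    conf2.setOutputKeyClass(Text.class);
    conf2.setOutputValueClass(Text.class);
    conf2.setMapperClass(MapB.class);
    conf2.setReducerClass(ReduceB.class);
    conf2.setInputFormat(KeyValueTextInputFormat.class); // key = text before the first tab
    conf2.setOutputFormat(TextOutputFormat.class);
    conf2.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(conf2, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf2, new Path(args[2]));
    JobClient.runJob(conf2);

    // Step 3: average the counts per UserID
    JobConf conf3 = new JobConf(Trial.class);
    conf3.setJobName("Step3");
    conf3.setMapOutputKeyClass(Text.class);
    conf3.setMapOutputValueClass(Text.class);
    conf3.setOutputKeyClass(Text.class);
    conf3.setOutputValueClass(DoubleWritable.class);
    conf3.setMapperClass(MapC.class);
    conf3.setReducerClass(ReduceC.class);
    conf3.setInputFormat(KeyValueTextInputFormat.class);
    conf3.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf3, new Path(args[2]));
    FileOutputFormat.setOutputPath(conf3, new Path(args[3]));
    JobClient.runJob(conf3);
}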