package lab.dummy;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.lang.*;
public class PatCitedCount extends Configured implements Tool
{
public static class MapClass extends MapReduceBase implements Mapper<Text, Text, IntWritable, IntWritable>
{
private final static IntWritable uno = new IntWritable(1);
private IntWritable citationCount= new IntWritable();
public void map(Text key, Text value,
OutputCollector<IntWritable, IntWritable> output,
Reporter reporter ) throws IOException
{
citationCount.set(Integer.parseInt(value.toString()));
output.collect(citationCount,uno);
}
}
public static class Reduce extends MapReduceBase
implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable>
{
public void reduce(IntWritable key, Iterator<IntWritable> values,
OutputCollector<IntWritable, IntWritable> output, Reporter reporter )
throws IOException
{
int count= 0;
while(values.hasNext())
{
count+=values.next().get();
}
output.collect(key, new IntWritable(count));
}
}
public int run(String[] args) throws Exception
{
Configuration conf = getConf();
JobConf job = new JobConf( conf, PatCitedCount.class );
job.setJarByClass(getClass());
Path in = new Path( args[0] );
Path out = new Path( args[1] );
FileInputFormat.setInputPaths( job, in );
FileOutputFormat.setOutputPath( job, out );
job.setJobName( "PatCitedCount" );
job.setMapperClass( MapClass.class );
job.setReducerClass( Reduce.class );
job.setInputFormat(KeyValueTextInputFormat.class);
job.setOutputFormat( TextOutputFormat.class );
job.setOutputKeyClass( IntWritable.class );
job.setOutputValueClass( IntWritable.class );
JobClient.runJob(job);
return 0;
}
public static void main(String[] args) throws Exception
{
int res= ToolRunner.run(new Configuration(), new PatCitedCount(), args);
System.exit(res);
}
}
我收到以下错误:
java.lang.NumberFormatException: For input string: ""
    at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
    at java.lang.Integer.parseInt(Integer.java:504)
    at java.lang.Integer.parseInt(Integer.java:527)
    at lab.dummy.PatCitedCount$MapClass.map(PatCitedCount.java:60)
    at lab.dummy.PatCitedCount$MapClass.map(PatCitedCount.java:1)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:54)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:450)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:168)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1614)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:163)
答案 0（得分：0）
您的映射器从 HDFS 读取的文件中含有一些空值。您正在尝试将空字符串解析为数字，因此会抛出 NumberFormatException。
您需要检查并删除这些行,或者在映射器中需要手动检查以下内容:
try {
citationCount.set(Integer.parseInt(value.toString()));
output.collect(citationCount,uno);
} catch (NumberFormatException nfe) {
//handle exception if you want.
}
答案 1（得分：0）
使用上一个作业（citation_count）的结果作为输入，而不是原始文件 "cite75_99.txt"。这样一切都会正常。:D