Type mismatch: cannot convert from element type Object to LongWritable

Time: 2017-06-09 09:45:02

Tags: java eclipse ubuntu hadoop mapreduce

/* This is my reducer class. I am getting the error on the line
   "for (LongWritable value : values)". I am building a project that converts PDF input into text.
   (My guess at a parameterized version follows right after this class.)
   Reducer class: */

        import java.io.IOException;


        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.Reducer;

        public class WordCountReducer extends Reducer {
            protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (LongWritable value : values) {
                    sum += value.get();

                }
                context.write(key, new LongWritable(sum));
            }
        }
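
From what I can tell, the compile error appears because the raw Reducer type leaves Iterable untyped, so the loop variable can only be an Object. My guess at a fully parameterized version (I am not sure this is the right fix) is the sketch below:

        import java.io.IOException;

        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.Reducer;

        // Sketch: the same reducer, but with explicit generic parameters so the
        // Iterable is typed and reduce() actually overrides Reducer.reduce().
        public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
            @Override
            protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                    throws IOException, InterruptedException {
                long sum = 0;
                for (LongWritable value : values) {
                    sum += value.get();
                }
                context.write(key, new LongWritable(sum));
            }
        }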

//Mapper Class
        import java.io.IOException;
        import java.util.StringTokenizer;
        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.Mapper;
        import org.apache.hadoop.mapreduce.TaskAttemptContext;

        public class WordCountMapper extends Mapper {
            private Text word = new Text();
            private final static LongWritable one = new LongWritable(1);

            protected void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                String line = value.toString();
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    context.progress();
                    context.write(word, one);
                }
            }
        }

//PDF Record Reader class
        import java.io.IOException;
        import org.apache.hadoop.conf.Configuration;
        import org.apache.hadoop.fs.FSDataInputStream;
        import org.apache.hadoop.fs.FileSystem;
        import org.apache.hadoop.fs.Path;
        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.InputSplit;
        import org.apache.hadoop.mapreduce.RecordReader;
        import org.apache.hadoop.mapreduce.TaskAttemptContext;
        import org.apache.hadoop.mapreduce.lib.input.FileSplit;
        import org.apache.pdfbox.pdmodel.PDDocument;
        import org.apache.pdfbox.util.PDFTextStripper;

        public class PdfRecordReader extends RecordReader {

            private String[] lines = null;
            private LongWritable key = null;
            private Text value = null;

            @Override
            public void initialize(InputSplit genericSplit, TaskAttemptContext context)
                    throws IOException, InterruptedException {

                FileSplit split = (FileSplit) genericSplit;
                Configuration job = context.getConfiguration();
                final Path file = split.getPath();

                /*
                 * The code below opens the file and seeks to the start of the
                 * split; this is where the PDF parsing logic is applied.
                 */

                FileSystem fs = file.getFileSystem(job);
                FSDataInputStream fileIn = fs.open(split.getPath());
                PDDocument pdf = null;
                String parsedText = null;
                PDFTextStripper stripper;
                pdf = PDDocument.load(fileIn);
                stripper = new PDFTextStripper();
                parsedText = stripper.getText(pdf);
                this.lines = parsedText.split("\n");
            }

            @Override
            public boolean nextKeyValue() throws IOException, InterruptedException {

                if (key == null) {
                    key = new LongWritable();
                    key.set(1);
                    value = new Text();
                    value.set(lines[0]);
                } else {
                    int temp = (int) key.get();
                    if (temp < (lines.length - 1)) {
                        int count = (int) key.get();
                        value = new Text();
                        value.set(lines[count]);
                        count = count + 1;
                        key = new LongWritable(count);
                    } else {
                        return false;
                    }

                }
                if (key == null || value == null) {
                    return false;
                } else {
                    return true;
                }
            }

            @Override
            public LongWritable getCurrentKey() throws IOException,
                    InterruptedException {

                return key;
            }

            @Override
            public Text getCurrentValue() throws IOException, InterruptedException {

                return value;
            }

            @Override
            public float getProgress() throws IOException, InterruptedException {

                return 0;
            }

            @Override
            public void close() throws IOException {

            }

        }
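
Separately, the log at the end shows a PDFBox warning that the PDF document was never closed. I assume that is because close() above is empty; a sketch of the pieces of PdfRecordReader I would change (assuming the PDDocument loaded in initialize() is kept in a field) looks like this:

            // Sketch only: keep a reference to the document loaded in initialize(),
            // e.g. a field  private PDDocument pdf;  assigned by
            // pdf = PDDocument.load(fileIn);  and then release it here.
            @Override
            public void close() throws IOException {
                if (pdf != null) {
                    pdf.close(); // frees the COSDocument and avoids the finalizer warning
                }
            }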

        //Pdf_Input_Format class
        import java.io.IOException;

        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.InputSplit;
        import org.apache.hadoop.mapreduce.RecordReader;
        import org.apache.hadoop.mapreduce.TaskAttemptContext;
        import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

        public class PdfInputFormat extends FileInputFormat {

            @Override
            public RecordReader createRecordReader(
                    InputSplit split, TaskAttemptContext context) throws IOException,
                    InterruptedException {

                return new PdfRecordReader();
            }

        }

        //Pdf_Input_Driver class
        import java.io.IOException;

        import org.apache.hadoop.conf.Configuration;
        import org.apache.hadoop.fs.Path;
        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.Job;
        import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
        import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
        import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
        import org.apache.hadoop.util.GenericOptionsParser;

        public class PdfInputDriver {

            public static void main(String[] args) throws IOException,
                    InterruptedException, ClassNotFoundException {
                Configuration conf = new Configuration();
                GenericOptionsParser parser = new GenericOptionsParser(conf, args);
                args = parser.getRemainingArgs();
                Job job = new Job(conf, "Pdfwordcount");
                job.setJarByClass(PdfInputDriver.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(LongWritable.class);
                job.setInputFormatClass(PdfInputFormat.class);
                job.setOutputFormatClass(TextOutputFormat.class);
                FileInputFormat.setInputPaths(job, new Path(args[0]));
                FileOutputFormat.setOutputPath(job, new Path(args[1]));
                job.setMapperClass(WordCountMapper.class);
                job.setReducerClass(WordCountReducer.class);

                System.out.println(job.waitForCompletion(true));
            }
        }

Every class works fine except the reducer. If I change:

protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException

to

protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException

in the reducer class, the project compiles successfully, but when I run the exported JAR on Hadoop (Ubuntu) with the command hadoop -jar, I get this error:

        17/06/09 13:31:31 INFO mapred.LocalJobRunner: map task executor complete.
        17/06/09 13:31:31 WARN mapred.LocalJobRunner: job_local306343177_0001
        java.lang.Exception: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
            at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:489)
            at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:549)
        Caused by: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
            at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1072)
            at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:715)
            at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
            at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
            at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:125)
            at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
            at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
            at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
            at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:270)
            at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
            at java.util.concurrent.FutureTask.run(FutureTask.java:266)
            at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
            at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
            at java.lang.Thread.run(Thread.java:745)
        17/06/09 13:31:31 INFO mapreduce.Job: Job job_local306343177_0001 running in uber mode : false
        17/06/09 13:31:31 INFO mapreduce.Job:  map 0% reduce 0%
        17/06/09 13:31:31 INFO mapreduce.Job: Job job_local306343177_0001 failed with state FAILED due to: NA
        17/06/09 13:31:31 INFO mapreduce.Job: Counters: 0
        false
        java.lang.Throwable: Warning: You did not close the PDF Document
            at org.apache.pdfbox.cos.COSDocument.finalize(COSDocument.java:404)
            at java.lang.System$2.invokeFinalize(System.java:1270)
            at java.lang.ref.Finalizer.runFinalizer(Finalizer.java:98)
            at java.lang.ref.Finalizer.access$100(Finalizer.java:34)
            at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:210)
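
Reading the stack trace, the map output key reaches the framework as LongWritable even though my map() writes Text keys. My guess (I am not sure) is that with the raw Mapper type my map(LongWritable key, Text value, Context context) is only an overload, never overrides Mapper.map(), and so the default identity mapper runs and forwards the LongWritable input key. For reference, a parameterized mapper sketch that mirrors the reducer sketch above:

        import java.io.IOException;
        import java.util.StringTokenizer;

        import org.apache.hadoop.io.LongWritable;
        import org.apache.hadoop.io.Text;
        import org.apache.hadoop.mapreduce.Mapper;

        // Sketch: the same mapper, with explicit generic parameters so that
        // map() overrides Mapper.map() instead of sitting beside it as an overload.
        public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
            private final Text word = new Text();
            private final static LongWritable one = new LongWritable(1);

            @Override
            protected void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                StringTokenizer tokenizer = new StringTokenizer(value.toString());
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    context.write(word, one);
                }
            }
        }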

My aim is to take a PDF as input from HDFS and convert it into text so it can be used by the Hadoop ecosystem. Any other suggestions would also be appreciated.

0 Answers:

No answers