Feeding images into Hadoop as SequenceFile input

Date: 2015-03-23 17:50:50

Tags: java hadoop ocr

I wrote a map-reduce program to run javaocr inside Hadoop, but the mapper never receives the image files as input.
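For reference: a SequenceFileInputFormat job only delivers records whose key/value classes match those recorded when the SequenceFile was written, so the images have to be packed with Text keys and BytesWritable values to match the mapper below. A minimal packing sketch (the class name ImageToSequenceFile and the directory/path arguments are illustrative, not part of the original setup):

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Packs every file in a local directory into one SequenceFile, keyed by file
// name, so records arrive in the mapper as (Text, BytesWritable) pairs.
public class ImageToSequenceFile {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer = SequenceFile.createWriter(
                fs, conf, new Path(args[1]), Text.class, BytesWritable.class);
        try {
            // args[0] is assumed to be a local directory containing only images.
            for (File img : new File(args[0]).listFiles()) {
                byte[] bytes = Files.readAllBytes(img.toPath());
                writer.append(new Text(img.getName()), new BytesWritable(bytes));
            }
        } finally {
            writer.close();
        }
    }
}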

Mapper class:

import hadoopocr.OcrMain;

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;

import javax.imageio.ImageIO;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ImageMapper extends Mapper<Text, BytesWritable, Text, Text> {

    @Override
    public void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        String cont = null;
        try {
            // BytesWritable.getBytes() returns the padded backing array;
            // trim it to getLength() so ImageIO sees only the image bytes.
            byte[] imageBytes = Arrays.copyOf(value.getBytes(), value.getLength());
            cont = findText(imageBytes);
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
            return;
        } catch (Exception ex) {
            ex.printStackTrace();
            return; // without this, new Text(null) below would throw an NPE
        }
        Text dat = new Text(cont);
        System.out.println("hello " + cont);
        context.write(key, dat);
    }

    static String findText(byte[] imageData) throws Exception {
        BufferedImage targImage = ImageIO.read(new ByteArrayInputStream(imageData));
        OcrMain om = new OcrMain();
        String content = om.ExtractTextFromImage(targImage);
        return content;
    }
}
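If map() never fires, one quick check is whether the SequenceFile on HDFS was actually written with the key/value classes this mapper expects; the file header records them. A minimal check, assuming the sequence file path is passed as the first argument (the class name CheckSeqFile is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

public class CheckSeqFile {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Print the key/value classes recorded in the file header; they must
        // match the mapper's input types (Text / BytesWritable).
        SequenceFile.Reader reader =
                new SequenceFile.Reader(fs, new Path(args[0]), conf);
        try {
            System.out.println("key class:   " + reader.getKeyClassName());
            System.out.println("value class: " + reader.getValueClassName());
        } finally {
            reader.close();
        }
    }
}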

Reducer class:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ImageReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // The key is whatever the mapper emitted (here the SequenceFile record
        // key) and the values are the OCR results associated with it. Keep
        // only the first value per key.
        Text content = null;
        for (Text ocrText : values) {
            content = ocrText;
            break; // only the first one
        }
        // In the result file the key is written out again alongside the text.
        context.write(key, content);
    }
}

Main class:

package imageProc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class ImageProc {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // This line makes Hadoop run locally
        //conf.set("mapred.job.tracker", "local");

        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: imageproc <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "hadoop ocr");
        job.setJarByClass(ImageProc.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        //job.setInputFormatClass(ByteWritable.class);
        job.setMapperClass(ImageMapper.class);
        job.setReducerClass(ImageReducer.class);
        //job.setNumReduceTasks(2);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
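Once the images are packed, the job would typically be launched with something like the following (the jar name and HDFS paths are illustrative):

hadoop jar imageproc.jar imageProc.ImageProc /user/me/images.seq /user/me/ocr-out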

Can you spot what is wrong with these files? Alternatively, is there any other way to feed image files as input to the mapper class?
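For reference, one widely used alternative is a custom input format that hands each image file to the mapper whole, instead of packing images into a SequenceFile first. The sketch below follows the well-known whole-file-input pattern (class names are illustrative); each record's key is the file path and its value is the raw file contents:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Emits one record per input file: key = file path, value = whole file.
public class WholeFileInputFormat extends FileInputFormat<Text, BytesWritable> {

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false; // never split an image across mappers
    }

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        return new WholeFileRecordReader();
    }

    static class WholeFileRecordReader extends RecordReader<Text, BytesWritable> {

        private FileSplit split;
        private TaskAttemptContext context;
        private final Text key = new Text();
        private final BytesWritable value = new BytesWritable();
        private boolean processed = false;

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) {
            this.split = (FileSplit) split;
            this.context = context;
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            if (processed) {
                return false;
            }
            // Read the entire file into memory in one shot.
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(context.getConfiguration());
            byte[] contents = new byte[(int) split.getLength()];
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                IOUtils.readFully(in, contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(in);
            }
            key.set(file.toString());
            value.set(contents, 0, contents.length);
            processed = true;
            return true;
        }

        @Override
        public Text getCurrentKey() {
            return key;
        }

        @Override
        public BytesWritable getCurrentValue() {
            return value;
        }

        @Override
        public float getProgress() {
            return processed ? 1.0f : 0.0f;
        }

        @Override
        public void close() {
        }
    }
}

With this format the driver would call job.setInputFormatClass(WholeFileInputFormat.class) and point FileInputFormat.addInputPath at a directory of raw images; the mapper signature Mapper<Text, BytesWritable, Text, Text> stays the same, with the key now holding the file path.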

Thanks in advance.

0 Answers:

No answers yet.