I wrote a map-reduce program to run javaocr on Hadoop, but the mapper never receives the image files as input.
Mapper class:
import hadoopocr.OcrMain;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.security.NoSuchAlgorithmException;
import javax.imageio.ImageIO;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ImageMapper extends Mapper<Text, BytesWritable, Text, Text> {

    @Override
    public void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // key/value come straight from the input SequenceFile records (image bytes in the value)
        String cont = null;
        try {
            cont = findText(value.getBytes());
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
            return;
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        Text dat = new Text(cont);
        System.out.println("hello " + cont);
        context.write(key, dat);
    }

    // Run OCR over the image bytes and return the recognized text.
    static String findText(byte[] imageData) throws NoSuchAlgorithmException, Exception {
        BufferedImage targImage = ImageIO.read(new ByteArrayInputStream(imageData));
        OcrMain om = new OcrMain();
        String content = om.ExtractTextFromImage(targImage);
        return content;
    }
}
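One detail I was not sure about while writing the mapper: as far as I know, BytesWritable.getBytes() returns the backing buffer, which can be longer than the actual data, so ImageIO might see trailing zero bytes. If that matters, trimming the buffer to getLength() is what I would try instead (just a guess on my side):

// Trim the BytesWritable buffer to its real length before decoding the image.
byte[] imageBytes = java.util.Arrays.copyOf(value.getBytes(), value.getLength());
cont = findText(imageBytes);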
Reducer class:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ImageReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // The key here is the md5 hash, while the values are all the image files
        // associated with it; for each md5 value we need to keep only one file (the first).
        Text content = null;
        for (Text filePath : values) {
            content = filePath;
            break; // only the first one
        }
        // In the result file the key will again be the image file path.
        context.write(key, content);
    }
}
Main class:
package imageProc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class ImageProc {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // This is the line that makes Hadoop run locally
        //conf.set("mapred.job.tracker", "local");
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: imageproc <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "hadoop ocr");
        job.setJarByClass(ImageProc.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        //job.setInputFormatClass(ByteWritable.class);
        job.setMapperClass(ImageMapper.class);
        job.setReducerClass(ImageReducer.class);
        //job.setNumReduceTasks(2);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Can you spot what is wrong with these files? And is there any alternative way to feed image files as input to the mapper class?
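For context, my understanding is that SequenceFileInputFormat only hands (Text, BytesWritable) pairs to the mapper if the images have already been packed into a SequenceFile, so I assume a separate packing step like the sketch below is needed first (ImagePacker is just a name I made up; the key is the file name and the value is the raw image bytes):

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ImagePacker {
    public static void main(String[] args) throws Exception {
        // args[0]: directory holding the raw image files, args[1]: output SequenceFile
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer = SequenceFile.createWriter(
                fs, conf, new Path(args[1]), Text.class, BytesWritable.class);
        try {
            for (FileStatus status : fs.listStatus(new Path(args[0]))) {
                // Read the whole image file into memory.
                byte[] buf = new byte[(int) status.getLen()];
                InputStream in = fs.open(status.getPath());
                try {
                    IOUtils.readFully(in, buf, 0, buf.length);
                } finally {
                    in.close();
                }
                // key = file name, value = raw image bytes
                writer.append(new Text(status.getPath().getName()), new BytesWritable(buf));
            }
        } finally {
            writer.close();
        }
    }
}

The idea would be to run this once over the image directory and then point the job's input path (otherArgs[0]) at the resulting SequenceFile, but I am not sure whether this is the right approach.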
Thanks in advance.