I need to implement a custom I/O format based on the ORCFile I/O format. How should I go about this?
Specifically, I need a way to include the ORCFile library in my source code (this is for a custom Pig implementation), use the ORCFile output format to write data, and then use the ORCFile input format to read the data back.
Answer 0 (score: 0)
You need to create an InputFormat class (or a subclass of FileInputFormat, depending on the nature of your files).
Just google for Hadoop InputFormat and you will find plenty of articles and tutorials on how to create your own InputFormat class.
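For illustration, here is a minimal skeleton of a FileInputFormat subclass using the old mapred API (chosen to match the sample code in the answer below). The class name, key/value types, and the empty RecordReader are placeholders, not from the original post; the RecordReader body is where your format-specific parsing would go.

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class MyCustomInputFormat extends FileInputFormat<NullWritable, Text> {
    @Override
    public RecordReader<NullWritable, Text> getRecordReader(
            InputSplit split, JobConf job, Reporter reporter) throws IOException {
        // Return a reader that knows how to parse one split of your file format.
        return new RecordReader<NullWritable, Text>() {
            public boolean next(NullWritable key, Text value) throws IOException {
                return false; // fill value with the next record; false = end of split
            }
            public NullWritable createKey() { return NullWritable.get(); }
            public Text createValue() { return new Text(); }
            public long getPos() throws IOException { return 0; }
            public float getProgress() throws IOException { return 0f; }
            public void close() throws IOException { }
        };
    }
}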
Answer 1 (score: 0)
You can use the HCatalog library to read and write ORC files in MapReduce.
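As a rough sketch of what that wiring looks like (new mapreduce API, which is what HCatalog targets): the database/table names and the mapper class below are hypothetical, the ORC data is assumed to already be registered as Hive tables, and the classes come from org.apache.hive.hcatalog.mapreduce and org.apache.hive.hcatalog.data. The exact setInput/setSchema signatures vary between HCatalog versions, so treat this as a sketch rather than a drop-in driver.

Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "orc-via-hcatalog");

// Read rows of an ORC-backed Hive table as HCatRecord values.
HCatInputFormat.setInput(job, "default", "my_orc_input_table");
job.setInputFormatClass(HCatInputFormat.class);

// Write HCatRecord values into another ORC-backed Hive table.
HCatOutputFormat.setOutput(job, OutputJobInfo.create("default", "my_orc_output_table", null));
HCatOutputFormat.setSchema(job, HCatOutputFormat.getTableSchema(job.getConfiguration()));
job.setOutputFormatClass(HCatOutputFormat.class);

job.setMapperClass(MyHCatMapper.class); // a Mapper<WritableComparable, HCatRecord, NullWritable, DefaultHCatRecord>
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(DefaultHCatRecord.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);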
Answer 2 (score: 0)
I just wrote some sample code here. Hope it helps. Sample mapper code:
public static class MyMapper<K extends WritableComparable, V extends Writable>
        extends MapReduceBase implements Mapper<K, OrcStruct, Text, IntWritable> {

    private StructObjectInspector oip;
    private final OrcSerde serde = new OrcSerde();

    public void configure(JobConf job) {
        // Describe the table layout so the SerDe can build an ObjectInspector for the rows.
        Properties table = new Properties();
        table.setProperty("columns", "a,b,c");
        table.setProperty("columns.types", "int,string,struct<d:int,e:string>");
        serde.initialize(job, table);
        try {
            oip = (StructObjectInspector) serde.getObjectInspector();
        } catch (SerDeException e) {
            e.printStackTrace();
        }
    }

    public void map(K key, OrcStruct val,
            OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        System.out.println(val);
        // B_ID, C_ID, D_ID and E_ID are field-index constants defined elsewhere in the class.
        List<? extends StructField> fields = oip.getAllStructFieldRefs();

        // Read column "b" as a string.
        StringObjectInspector bInspector =
                (StringObjectInspector) fields.get(B_ID).getFieldObjectInspector();
        String b = "garbage";
        try {
            b = bInspector.getPrimitiveJavaObject(
                    oip.getStructFieldData(serde.deserialize(val), fields.get(B_ID)));
        } catch (SerDeException e1) {
            e1.printStackTrace();
        }

        // Read column "c" as a nested struct and pull out its fields "d" and "e".
        OrcStruct struct = null;
        try {
            struct = (OrcStruct) oip.getStructFieldData(serde.deserialize(val), fields.get(C_ID));
        } catch (SerDeException e1) {
            e1.printStackTrace();
        }
        StructObjectInspector cInspector =
                (StructObjectInspector) fields.get(C_ID).getFieldObjectInspector();
        int d = ((IntWritable) cInspector.getStructFieldData(struct, fields.get(D_ID))).get();
        String e = cInspector.getStructFieldData(struct, fields.get(E_ID)).toString();

        output.collect(new Text(b), new IntWritable(1));
        output.collect(new Text(e), new IntWritable(1));
    }
}
Driver code:
JobConf job = new JobConf(new Configuration(), OrcReader.class);
// Specify various job-specific parameters
job.setJobName("myjob");
job.set("mapreduce.framework.name","local");
job.set("fs.default.name","file:///");
job.set("log4j.logger.org.apache.hadoop","INFO");
job.set("log4j.logger.org.apache.hadoop","INFO");
// push down projection columns
job.set("hive.io.file.readcolumn.ids","1,2");
job.set("hive.io.file.read.all.columns","false");
job.set("hive.io.file.readcolumn.names","b,c");
FileInputFormat.setInputPaths(job, new Path("./src/main/resources/000000_0.orc"));
FileOutputFormat.setOutputPath(job, new Path("./target/out1"));
job.setMapperClass(OrcReader.MyMapper.class);
job.setCombinerClass(OrcReader.MyReducer.class);
job.setReducerClass(OrcReader.MyReducer.class);
job.setInputFormat(OrcInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
JobClient.runJob(job);
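The sample above only covers reading ORC. If you also need to produce ORC files, one option (outside of an OutputFormat) is Hive's ORC writer API in org.apache.hadoop.hive.ql.io.orc.OrcFile, available in Hive 0.13 and later. This is a hedged sketch rather than a definitive recipe: MyRow is a hypothetical row class and the output path is chosen for illustration.

static class MyRow {
    int a;
    String b;
    MyRow(int a, String b) { this.a = a; this.b = b; }
}

Configuration conf = new Configuration();
// Build an ObjectInspector for the row class via reflection.
ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
Writer writer = OrcFile.createWriter(new Path("./target/sample.orc"),
        OrcFile.writerOptions(conf).inspector(inspector));
writer.addRow(new MyRow(1, "hello"));
writer.addRow(new MyRow(2, "world"));
writer.close();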