I am trying to write a Snappy block-compressed sequence file from a map-reduce job. I am using Hadoop 2.0.0-cdh4.5.0 and snappy-java 1.0.4.1.
Here is my code:
package jinvestor.jhouse.mr;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.List;
import jinvestor.jhouse.core.House;
import jinvestor.jhouse.core.util.HouseAvroUtil;
import jinvestor.jhouse.download.HBaseHouseDAO;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.VectorWritable;
/**
* Produces mahout vectors from House entries in HBase.
*
* @author Michael Scott Knapp
*
*/
public class HouseVectorizer {

    private final Configuration configuration;
    private final House minimumHouse;
    private final House maximumHouse;

    public HouseVectorizer(final Configuration configuration,
            final House minimumHouse, final House maximumHouse) {
        this.configuration = configuration;
        this.minimumHouse = minimumHouse;
        this.maximumHouse = maximumHouse;
    }

    public void vectorize() throws IOException, ClassNotFoundException, InterruptedException {
        JobConf jobConf = new JobConf();
        jobConf.setMapOutputKeyClass(LongWritable.class);
        jobConf.setMapOutputValueClass(VectorWritable.class);

        // we want the vectors written straight to HDFS,
        // the order does not matter.
        jobConf.setNumReduceTasks(0);

        Path outputDir = new Path("/home/cloudera/house_vectors");
        FileSystem fs = FileSystem.get(configuration);
        if (fs.exists(outputDir)) {
            fs.delete(outputDir, true);
        }

        FileOutputFormat.setOutputPath(jobConf, outputDir);

        // I want the mappers to know the max and min value
        // so they can normalize the data.
        // I will add them as properties in the configuration,
        // by serializing them with avro.
        String minmax = HouseAvroUtil.toBase64String(Arrays.asList(minimumHouse,
                maximumHouse));
        jobConf.set("minmax", minmax);

        Job job = Job.getInstance(jobConf);
        Scan scan = new Scan();
        scan.addFamily(Bytes.toBytes("data"));
        TableMapReduceUtil.initTableMapperJob("homes", scan,
                HouseVectorizingMapper.class, LongWritable.class,
                VectorWritable.class, job);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(VectorWritable.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(VectorWritable.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, outputDir);

        job.getConfiguration().setClass("mapreduce.map.output.compress.codec",
                SnappyCodec.class,
                CompressionCodec.class);
        job.waitForCompletion(true);
    }
}
When I run it, I get this:
java.lang.Exception: java.lang.UnsatisfiedLinkError: org.apache.hadoop.util.NativeCodeLoader.buildSupportsSnappy()Z
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:401)
Caused by: java.lang.UnsatisfiedLinkError: org.apache.hadoop.util.NativeCodeLoader.buildSupportsSnappy()Z
at org.apache.hadoop.util.NativeCodeLoader.buildSupportsSnappy(Native Method)
at org.apache.hadoop.io.compress.SnappyCodec.checkNativeCodeLoaded(SnappyCodec.java:62)
at org.apache.hadoop.io.compress.SnappyCodec.getCompressorType(SnappyCodec.java:127)
at org.apache.hadoop.io.compress.CodecPool.getCompressor(CodecPool.java:104)
at org.apache.hadoop.io.compress.CodecPool.getCompressor(CodecPool.java:118)
at org.apache.hadoop.io.SequenceFile$Writer.init(SequenceFile.java:1169)
at org.apache.hadoop.io.SequenceFile$Writer.<init>(SequenceFile.java:1080)
at org.apache.hadoop.io.SequenceFile$BlockCompressWriter.<init>(SequenceFile.java:1400)
at org.apache.hadoop.io.SequenceFile.createWriter(SequenceFile.java:274)
at org.apache.hadoop.io.SequenceFile.createWriter(SequenceFile.java:527)
at org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.getSequenceWriter(SequenceFileOutputFormat.java:64)
at org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.getRecordWriter(SequenceFileOutputFormat.java:75)
at org.apache.hadoop.mapred.MapTask$NewDirectOutputCollector.<init>(MapTask.java:617)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:737)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:338)
at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:233)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
If I comment out these lines, then my test passes:
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
job.getConfiguration().setClass("mapreduce.map.output.compress.codec",
SnappyCodec.class,
CompressionCodec.class);
However, I really do want Snappy compression in my sequence files. Can someone please explain what I am doing wrong?
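For reference, here is a minimal sketch of the kind of guard I could put around those lines so the job only asks for Snappy when the native bindings are actually loadable. It uses the NativeCodeLoader class that appears in the stack trace above; the SnappyGuard class name and the DefaultCodec fallback are only illustrations, not what I ultimately want:
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.NativeCodeLoader;

public class SnappyGuard {

    // True only if libhadoop.so was found on java.library.path AND that
    // build of libhadoop was compiled with snappy support.
    static boolean snappyUsable() {
        return NativeCodeLoader.isNativeCodeLoaded()
                && NativeCodeLoader.buildSupportsSnappy();
    }

    // Configure block compression on the job, falling back to the
    // pure-Java DefaultCodec when native snappy is unavailable.
    static void configureCompression(Job job) {
        SequenceFileOutputFormat.setOutputCompressionType(job,
                SequenceFile.CompressionType.BLOCK);
        if (snappyUsable()) {
            SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
        } else {
            SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        }
    }
}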
Answer 0 (Score: 8)
For example, I use Hortonworks HDP, and my spark-env.sh contains the following configuration:
export JAVA_LIBRARY_PATH=$JAVA_LIBRARY_PATH:/usr/hdp/2.2.0.0-2041/hadoop/lib/native
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/hdp/2.2.0.0-2041/hadoop/lib/native
export SPARK_YARN_USER_ENV="JAVA_LIBRARY_PATH=$JAVA_LIBRARY_PATH,LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
Answer 1 (Score: 2)
Check your core-site.xml and mapred-site.xml; they should contain the correct properties and the path to the folder with the libraries.
core-site.xml
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
mapred-site.xml
<property>
<name>mapreduce.map.output.compress</name>
<value>true</value>
</property>
<property>
<name>mapred.map.output.compress.codec</name>
<value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
<property>
<name>mapreduce.admin.user.env</name>
<value>LD_LIBRARY_PATH=/usr/hdp/2.2.0.0-1084/hadoop/lib/native</value>
</property>
LD_LIBRARY_PATH must contain the path to libsnappy.so.
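If you would rather set (or double-check) these from the driver code instead of the XML files, a rough sketch of the equivalent calls on the job's Configuration could look like the following. The property names are taken from the snippets above, and the native path is only an example that has to match your own installation:
// inside the driver, before submitting the org.apache.hadoop.mapreduce.Job
Configuration conf = job.getConfiguration();
conf.set("io.compression.codecs",
        "org.apache.hadoop.io.compress.GzipCodec,"
        + "org.apache.hadoop.io.compress.DefaultCodec,"
        + "org.apache.hadoop.io.compress.SnappyCodec");
// compress intermediate map output with Snappy
conf.setBoolean("mapreduce.map.output.compress", true);
conf.set("mapred.map.output.compress.codec",
        "org.apache.hadoop.io.compress.SnappyCodec");
// tell the task JVMs where libsnappy.so / libhadoop.so live; whether a
// per-job override of this admin setting is honored depends on your cluster,
// so mapred-site.xml (as above) is the safer place for it.
conf.set("mapreduce.admin.user.env",
        "LD_LIBRARY_PATH=/usr/hdp/2.2.0.0-1084/hadoop/lib/native");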
Answer 2 (Score: 1)
My problem was that my JRE did not contain the appropriate native libraries. This may or may not be because I switched from the JDK in Cloudera's pre-built VM to JDK 1.7. The snappy .so files are in the hadoop/lib/native directory, and the JRE needs to have them. Adding them to the classpath did not seem to resolve my problem. I resolved it like this:
$ cd /usr/lib/hadoop/lib/native
$ sudo cp *.so /usr/java/latest/jre/lib/amd64/
After that I was able to use the SnappyCodec class. Your paths may be different, though.
That seemed to get me to the next problem:
Caused by: java.lang.RuntimeException: native snappy library not available: SnappyCompressor has not been loaded.
Still working on how to resolve that.
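A quick way to confirm whether the copied libraries are actually visible to the JVM is a tiny standalone check along these lines (the NativeCheck class name is just for illustration; it only needs hadoop-common on the classpath):
import org.apache.hadoop.util.NativeCodeLoader;

public class NativeCheck {
    public static void main(String[] args) {
        // Directories the JVM searches for native libraries such as
        // libhadoop.so and libsnappy.so.
        System.out.println("java.library.path = "
                + System.getProperty("java.library.path"));
        // True only if libhadoop.so was found and loaded successfully.
        System.out.println("native hadoop loaded = "
                + NativeCodeLoader.isNativeCodeLoaded());
    }
}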
Answer 3 (Score: 0)
I needed all of the files, not just the *.so files. Ideally you would also include the folder on your path rather than copying the libraries out of it. After that you need to restart the MapReduce service so that the new libraries are picked up and can be used.
Nico
Answer 4 (Score: 0)
After removing hadoop.dll from windows\system32 (which I had copied there manually) and setting HADOOP_HOME=\hadoop-2.6.4, IT WORKS!!!
Answer 5 (Score: 0)
In my case, check the Hive conf file mapred-site.xml and the value of the key mapreduce.admin.user.env.
I tested this on a new datanode and got the buildSupportsSnappy UnsatisfiedLinkError on a machine that had none of the native dependencies (libsnappy.so, etc.).