从 HDFS 读取 ZIP 文件并使用 Spark（Java）提取

时间:2017-12-07 11:01:45

标签: java hadoop apache-spark hdfs

我在 hdfs://localhost:8020/sample.zip 上有一个 ZIP 文件。我需要读取并将其解压缩到目录 hdfs://localhost:8020/sample。

1 个答案:

答案 0（得分：1）

以下代码可能有效。其中 zipLoc 是 ZIP 文件的位置；hdfsBasePath 是用于写入文件的 HDFS 目录。

/**
 * Reads every ZIP archive matched by {@code zipLoc} via Spark's binaryFiles API and
 * extracts each non-directory entry into HDFS under {@code hdfsBasePath}, preserving
 * entry names as relative paths.
 *
 * <p>Fixes over the original version:
 * <ul>
 *   <li>Entries are copied as raw bytes instead of through a {@code Scanner}. A single
 *       Scanner over the whole {@code ZipInputStream} buffers past entry boundaries
 *       (leaving later entries empty) and its line-based reading drops newlines and,
 *       via {@code writeBytes}, truncates chars to their low byte — corrupting any
 *       binary or non-ASCII content.</li>
 *   <li>{@code flush()} was previously called <em>after</em> {@code close()}, which is
 *       a no-op; try-with-resources now closes (and implicitly flushes) correctly.</li>
 *   <li>The Hadoop {@code Configuration}/{@code FileSystem} are created once per
 *       archive rather than once per entry.</li>
 *   <li>Streams are no longer leaked on exception, and failures propagate instead of
 *       being swallowed by {@code printStackTrace}.</li>
 * </ul>
 *
 * @param zipLoc       location (path or glob) of the ZIP file(s), readable by Spark
 * @param hdfsBasePath HDFS directory under which extracted entries are written
 * @throws RuntimeException wrapping the underlying {@link IOException} on any
 *                          read/write failure
 */
public void readWriteZipContents(String zipLoc, String hdfsBasePath) {
    JavaSparkContext jsc = new JavaSparkContext(new SparkContext(new SparkConf()));
    JavaPairRDD<String, PortableDataStream> zipFilesRdd = jsc.binaryFiles(zipLoc);
    zipFilesRdd.collect().forEach(file -> {
        // Build the HDFS client once per archive, not once per entry.
        Configuration configuration = new Configuration();
        configuration.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        configuration.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
        try (ZipInputStream zipStream = new ZipInputStream(file._2.open())) {
            FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:8020"), configuration);
            byte[] buffer = new byte[8192];
            ZipEntry zipEntry;
            while ((zipEntry = zipStream.getNextEntry()) != null) {
                if (!zipEntry.isDirectory()) {
                    // Binary-safe copy of exactly this entry's bytes; ZipInputStream
                    // signals end-of-entry with -1 before the next getNextEntry().
                    try (FSDataOutputStream hdfsFile =
                            fs.create(new Path(hdfsBasePath + "/" + zipEntry.getName()))) {
                        int read;
                        while ((read = zipStream.read(buffer)) != -1) {
                            hdfsFile.write(buffer, 0, read);
                        }
                    }
                }
                zipStream.closeEntry();
            }
        } catch (IOException e) {
            // Propagate instead of swallowing: a failed extraction must not look like success.
            throw new RuntimeException(
                "Failed extracting " + zipLoc + " into " + hdfsBasePath, e);
        }
    });
}