How to read a file from Amazon S3 using Apache Spark without writing it to a local file?

Asked: 2017-05-11 11:04:40

Tags: java amazon-web-services apache-spark amazon-s3

I have files stored in Amazon S3. I want to parse part of a file. I tried the approach below to read it, but it takes too much time, because it first writes the file locally and Spark then parses the locally saved copy. Is there a way to read the file directly from Amazon S3 with Spark?

I tried it this way:

package com.elegant.amazon;

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.S3Object;

public class AmazonS3Spark {

    public static void main(String[] args) throws IOException {
        int numOfLinesRead = 0;
        SparkSession spark = SparkSession.builder().appName("SparkCassandraApp")
                .config("spark.cassandra.connection.host", "localhost")
                .config("spark.cassandra.connection.port", "9042").master("local[*]").getOrCreate();
        String outPutFile = "/home/vipin/mylocalfile.csv";
        FileWriter writer = new FileWriter(outPutFile);
        S3Object s3object = AmazonS3Util.s3Client
                .getObject(new GetObjectRequest("flightdata-em", "2015/On_Time_On_Time_Performance_2015_1.csv"));

        InputStreamReader decoder = new InputStreamReader(s3object.getObjectContent());
        BufferedReader buffered = new BufferedReader(decoder);
        int numOfLinesToRead = 1000;
        String thisLine = null;
        Long startTime = System.currentTimeMillis();
        while ((thisLine = buffered.readLine()) != null && numOfLinesRead < numOfLinesToRead) {
            writer.write(thisLine + '\n');
            numOfLinesRead++;
        }
        // Close the writer so the file is fully flushed before Spark reads it.
        writer.close();
        buffered.close();

        Dataset<Row> people = spark.read().csv(outPutFile);

        /*Dataset<Row> df = spark.read().option("header", "true").option("treatEmptyValuesAsNulls", "true")
                .option("nullValue", "0").option("delimiter", ",")
                .csv("s3n://AKIAI6OYME7W2QAIIOAA:Cn1XhD72rEIRg7etAjhXFBQLRiverQgsnrtt+CRf@flightdata-em//2015/On_Time_On_Time_Performance_2015_1.csv");*/
        people.printSchema();
        people.show(10000, false);
        System.out.println("Total Time Taken " + (System.currentTimeMillis() - startTime));
    }

}

With this approach the CSV file is written to a local file and Spark then parses that local copy, so performance suffers. Is there a solution that parses it without writing it out first?

0 Answers:

There are no answers yet.
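For reference, here is a minimal sketch of reading the CSV directly from S3 through Spark's Hadoop `s3a://` connector, with no intermediate local copy. This is an assumption-laden sketch, not a verified answer: it assumes the `hadoop-aws` module (and its matching AWS SDK jar) is on the classpath, takes the bucket and object key from the question, and reads credentials from the standard `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` environment variables via the `fs.s3a.*` Hadoop properties (the default AWS credential provider chain also works).

```java
package com.elegant.amazon;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class AmazonS3DirectRead {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("S3DirectRead")
                .master("local[*]")
                // Hypothetical credential wiring: values are read from the
                // environment; an instance profile or ~/.aws/credentials
                // would work as well via the default provider chain.
                .config("spark.hadoop.fs.s3a.access.key", System.getenv("AWS_ACCESS_KEY_ID"))
                .config("spark.hadoop.fs.s3a.secret.key", System.getenv("AWS_SECRET_ACCESS_KEY"))
                .getOrCreate();

        long startTime = System.currentTimeMillis();

        // Spark streams the object directly from S3; nothing is written locally.
        Dataset<Row> flights = spark.read()
                .option("header", "true")
                .csv("s3a://flightdata-em/2015/On_Time_On_Time_Performance_2015_1.csv");

        // limit(1000) mirrors the 1000-line cap in the question's code.
        flights.limit(1000).show(20, false);
        System.out.println("Total time taken: " + (System.currentTimeMillis() - startTime) + " ms");

        spark.stop();
    }
}
```

The `hadoop-aws` dependency must match the Hadoop version Spark was built against (e.g. the `org.apache.hadoop:hadoop-aws` artifact); `s3a` is the maintained connector and is generally preferred over the older `s3n` scheme used in the commented-out code above.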