I have files stored in Amazon S3. I want to parse a file only partially. I tried the approach below to read the file, but it takes far too long, because it first writes the data to a local file and Spark then parses that locally saved copy. Is there a way to read the file directly from Amazon S3 with Spark?
Here is what I tried:
package com.elegant.amazon;

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.S3Object;

public class AmazonS3Spark {

    public static void main(String[] args) throws IOException {
        SparkSession spark = SparkSession.builder().appName("SparkCassandraApp")
                .config("spark.cassandra.connection.host", "localhost")
                .config("spark.cassandra.connection.port", "9042")
                .master("local[*]").getOrCreate();

        // Download the object from S3 and copy its first 1000 lines to a local file.
        String outPutFile = "/home/vipin/mylocalfile.csv";
        FileWriter writer = new FileWriter(outPutFile);
        S3Object s3object = AmazonS3Util.s3Client
                .getObject(new GetObjectRequest("flightdata-em", "2015/On_Time_On_Time_Performance_2015_1.csv"));
        BufferedReader buffered = new BufferedReader(new InputStreamReader(s3object.getObjectContent()));

        int numOfLinesRead = 0;
        int numOfLinesToRead = 1000;
        String thisLine;
        long startTime = System.currentTimeMillis();
        while ((thisLine = buffered.readLine()) != null && numOfLinesRead < numOfLinesToRead) {
            writer.write(thisLine + '\n');
            numOfLinesRead++;
        }
        writer.close();   // flush the local file before Spark reads it
        buffered.close();

        // Spark parses the locally saved copy, not the S3 object itself.
        Dataset<Row> people = spark.read().csv(outPutFile);
        /*Dataset<Row> df = spark.read().option("header", "true").option("treatEmptyValuesAsNulls", "true")
                .option("nullValue", "0").option("delimiter", ",")
                .csv("s3n://<ACCESS_KEY>:<SECRET_KEY>@flightdata-em/2015/On_Time_On_Time_Performance_2015_1.csv");*/
        people.printSchema();
        people.show(10000, false);
        System.out.println("Total Time Taken " + (System.currentTimeMillis() - startTime));
    }
}
Writing the CSV to a local file this way and then having Spark parse the local copy is what hurts performance. Is there a way to parse it without writing it out first?
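For illustration, this is a minimal sketch of the kind of direct read I am asking about. It assumes the hadoop-aws module (with a matching aws-java-sdk jar) is on the classpath so that the s3a:// scheme is available; fs.s3a.access.key and fs.s3a.secret.key are the standard Hadoop property names, and the credential values and class name below are placeholders:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class AmazonS3SparkDirect {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("SparkS3DirectRead")
                .master("local[*]")
                // Standard Hadoop s3a properties; replace the placeholders with
                // real credentials, or rely on the default AWS credential chain.
                .config("spark.hadoop.fs.s3a.access.key", "YOUR_ACCESS_KEY")
                .config("spark.hadoop.fs.s3a.secret.key", "YOUR_SECRET_KEY")
                .getOrCreate();

        // Spark reads the object from S3 directly; no local copy is written.
        Dataset<Row> flights = spark.read()
                .option("header", "true")
                .csv("s3a://flightdata-em/2015/On_Time_On_Time_Performance_2015_1.csv");

        // limit() restricts how many rows the job returns, which matches the
        // partial-parse requirement above.
        flights.limit(1000).show(1000, false);

        spark.stop();
    }
}

Whether this works out of the box depends on the Spark/Hadoop build: the exact scheme (s3a, or the older s3n as in the commented-out block above) has to match the connector jars that the installation actually ships with.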