The goal of the application is to fetch data from MongoDB using Spark and create a CSV file. The code below does exactly that.

Spark version: 2.2.1, Spark Mongo connector: 2.2.1, Spark SQL: 2.2.1
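Assuming a Maven build with Scala 2.11 artifacts (the actual build file is not shown here), that version set corresponds to coordinates like the following:

<dependencies>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.2.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.2.1</version>
  </dependency>
  <dependency>
    <groupId>org.mongodb.spark</groupId>
    <artifactId>mongo-spark-connector_2.11</artifactId>
    <version>2.2.1</version>
  </dependency>
</dependencies>

When a job is run through spark-submit, the Spark artifacts are normally given <scope>provided</scope>, since the cluster supplies them at runtime.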
import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.bson.Document;
import org.joda.time.LocalDate;

import com.mongodb.spark.rdd.api.java.JavaMongoRDD;

import static org.apache.spark.sql.functions.col;

// Config, ConfigReader, SparkConfiguration, FileWriter, AmazonS3Client,
// getPreviousEventDate, NOT_RECONCILED and UN_RECONCILED_FILE are project
// helpers/constants defined elsewhere in the class.
public static void main(String[] args) throws IOException {
    try {
        // Read job settings (Mongo URI, temp CSV directory, S3 bucket) from application.yml
        Config config = new ConfigReader().readConfigFile("application.yml", Config.class);
        final String MONGO_CONNECTION_STRING = config.getMongodbConnectionString();
        final int PRIOR_EVENT = config.getPriorEvent();
        final String TEMP_CSV_DIR = config.getTempCsvDirectory();
        final String UPLOAD_BUCKET_NAME = config.getUploadBucketName();

        LocalDate previousEventDate = getPreviousEventDate(PRIOR_EVENT);

        // Initialize the Spark session and load the Mongo collection as an RDD of BSON documents
        JavaMongoRDD<Document> rdd = SparkConfiguration.initializeSparkSessionAndGetRDD(MONGO_CONNECTION_STRING);
        Dataset<Row> rowDataset = rdd.toDF();

        // Keep only rows whose status (column 7) is NOT_RECONCILED and whose
        // timestamp (column 12) falls on the previous event date
        Dataset<Row> notReconciled = rowDataset
                .filter(it -> it.getString(7).equalsIgnoreCase(NOT_RECONCILED))
                .filter(cr -> (new LocalDate(cr.getTimestamp(12))).isEqual(previousEventDate));

        Dataset<Row> selectedRows = notReconciled.select(col("_id"),
                col("tradingDate").alias("Trading Date"),
                col("storeId").alias("Store ID"),
                col("tillId").alias("Till ID"),
                col("saleTotal").alias("Sale Total"),
                col("tenderTotal").alias("Tender Total"),
                col("tenderBreakUp").alias("Tender Break Up"),
                col("status").alias("Status"));

        // Collect the result on the driver, write it to a local CSV file, then upload to S3
        File file = FileWriter.createFile(TEMP_CSV_DIR + UN_RECONCILED_FILE);
        List<Row> rows = selectedRows.collectAsList();
        FileWriter.writeStringToFile(rows, file);
        AmazonS3Client.uploadFileToAmazonS3Bucket(UPLOAD_BUCKET_NAME, UN_RECONCILED_FILE, TEMP_CSV_DIR + UN_RECONCILED_FILE);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
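SparkConfiguration.initializeSparkSessionAndGetRDD is a project helper whose source is not shown above. A minimal sketch of what it presumably does, using the connector's MongoSpark.load (the app name is a placeholder, and the connection string is assumed to include the database and collection):

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;

public class SparkConfiguration {

    public static JavaMongoRDD<Document> initializeSparkSessionAndGetRDD(String connectionString) {
        // spark.mongodb.input.uri points the connector at the cluster, database and collection
        SparkSession session = SparkSession.builder()
                .appName("unreconciled-report") // placeholder name
                .config("spark.mongodb.input.uri", connectionString)
                .getOrCreate();

        JavaSparkContext jsc = new JavaSparkContext(session.sparkContext());

        // MongoSpark.load returns a JavaMongoRDD<Document> backed by the configured collection
        return MongoSpark.load(jsc);
    }
}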
The code runs fine in the local environment, but when it is submitted via spark-submit it breaks on the Spark machine. The error received is as follows:
19/01/14 14:18:27 INFO DAGScheduler: Job 0 finished: treeAggregate at MongoInferSchema.scala:78, took 1.915453 s
Exception in thread "main" java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.ml.source.libsvm.LibSVMFileFormat not a subtype
at java.util.ServiceLoader.fail(Unknown Source)
at java.util.ServiceLoader.access$300(Unknown Source)
at java.util.ServiceLoader$LazyIterator.nextService(Unknown Source)
at java.util.ServiceLoader$LazyIterator.next(Unknown Source)
at java.util.ServiceLoader$1.next(Unknown Source)
at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:43)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.filterImpl(TraversableLike.scala:247)
at scala.collection.TraversableLike$class.filter(TraversableLike.scala:259)
at scala.collection.AbstractTraversable.filter(Traversable.scala:104)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:526)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:87)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:87)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:302)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at com.mongodb.spark.MongoSpark.toDF(MongoSpark.scala:608)
at com.mongodb.spark.MongoSpark.toDF(MongoSpark.scala:583)
at com.mongodb.spark.rdd.MongoRDD.toDF(MongoRDD.scala:73)
at com.mongodb.spark.rdd.api.java.JavaMongoRDD.toDF(JavaMongoRDD.scala:57)
at com.tesco.c2c.finance.D2UnreconciledReportingJob.main(D2UnreconciledReportingJob.java:32)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:775)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
19/01/14 14:18:27 INFO SparkContext: Invoking stop() from shutdown hook
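For context on what the trace shows: DataSource.lookupDataSource discovers data sources through java.util.ServiceLoader, which reads the META-INF/services/org.apache.spark.sql.sources.DataSourceRegister registry files on the classpath. A "Provider ... not a subtype" ServiceConfigurationError from that scan typically means two different copies of the Spark classes are visible at runtime, e.g. when the application uber jar bundles its own Spark artifacts alongside the ones the cluster provides. A commonly suggested mitigation (a sketch assuming a maven-shade-plugin build, not verified against this project) is to mark the Spark dependencies as provided and merge the service registry files so that entries are concatenated rather than overwritten:

<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-shade-plugin</artifactId>
  <executions>
    <execution>
      <phase>package</phase>
      <goals>
        <goal>shade</goal>
      </goals>
      <configuration>
        <transformers>
          <!-- merge META-INF/services entries from all dependencies -->
          <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
        </transformers>
      </configuration>
    </execution>
  </executions>
</plugin>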