I'm new to AWS and I'm trying to read the documents of a MongoDB collection and store them in S3 in Parquet format, using Scala together with Apache Spark. I've been at this for many days and have followed every step I could find on the internet, but I still can't get this simple set of MongoDB documents transferred to an S3 bucket as Parquet. Any help would be greatly appreciated.
spark-submit command:
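To make the goal concrete, the whole job boils down to something like the sketch below (a minimal sketch only, assuming the Mongo connector and s3a settings are picked up; the URIs, bucket name and collection name are the same placeholders used in the command and code further down, and the "output/" prefix is just an illustrative choice of mine):

import org.apache.spark.sql.SparkSession
import com.mongodb.spark.MongoSpark

object MongoToS3Sketch {
  def main(args: Array[String]): Unit = {
    // Read one MongoDB collection and write it out to S3 as Parquet.
    val spark = SparkSession.builder()
      .master("local")
      .appName("MongoToS3Sketch")
      .config("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/admin.collectionNAME")
      .config("spark.hadoop.fs.s3a.access.key", "<ACCESS_KEY>")          // placeholder
      .config("spark.hadoop.fs.s3a.secret.key", "<SECRET_KEY>")          // placeholder
      .config("spark.hadoop.fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")
      .getOrCreate()

    val df = MongoSpark.load(spark)                                      // DataFrame backed by the collection
    df.write.mode("overwrite").parquet("s3a://mongobucket/output/")      // "output/" is a placeholder prefix
    spark.stop()
  }
}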
bin/spark-submit \
  --packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.7,org.apache.hadoop:hadoop-aws:2.7.7,com.amazonaws:aws-java-sdk-pom:1.11.538 \
  --class com.pb3.mongos3.wd.transform1 \
  /home/user/Desktop/pbjar03.jar \
  mongodb://127.0.0.1/admin.collectionNAME \
  IWJRFIRWJFIWJRWIJRR:JjjoijIJnjonhijijOIJJjijJIjjjjjllLL@mongobucket/
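The two trailing arguments are the MongoDB source URI and the S3 target. The code further down still hardcodes both, but the intent is roughly this (a sketch only; the variable names are mine):

def main(args: Array[String]): Unit = {
  val mongoUri = args(0)   // e.g. mongodb://127.0.0.1/admin.collectionNAME
  val s3Target = args(1)   // e.g. <access-key>:<secret-key>@mongobucket/
  // ...build the SparkSession from mongoUri and write Parquet under s3Target...
}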
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.pb3.mongos3.wd</groupId>
  <artifactId>test</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>mongos3</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>2.2.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>2.2.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive-thriftserver -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive-thriftserver_2.11</artifactId>
      <version>2.2.1</version>
      <scope>provided</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-yarn -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-yarn_2.11</artifactId>
      <version>2.2.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-catalyst -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-catalyst_2.11</artifactId>
      <version>2.2.1</version>
      <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.11</artifactId>
      <version>2.2.1</version>
      <scope>provided</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.11</artifactId>
      <version>2.2.1</version>
      <scope>provided</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector -->
    <dependency>
      <groupId>org.mongodb.spark</groupId>
      <artifactId>mongo-spark-connector_2.10</artifactId>
      <version>2.2.7</version>
    </dependency>
    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>mongodb-driver-sync</artifactId>
      <version>3.11.2</version>
    </dependency>
    <dependency>
      <groupId>org.mongodb.scala</groupId>
      <artifactId>mongo-scala-driver_2.11</artifactId>
      <version>2.4.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.mongodb/casbah -->
    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>casbah_2.11</artifactId>
      <version>3.1.1</version>
      <type>pom</type>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-aws</artifactId>
      <version>2.10.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk -->
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-java-sdk</artifactId>
      <version>1.11.688</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-s3 -->
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-java-sdk-s3</artifactId>
      <version>1.11.688</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.scala-lang/scala-library -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>2.11.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.databricks/dbutils-api -->
    <dependency>
      <groupId>com.databricks</groupId>
      <artifactId>dbutils-api_2.11</artifactId>
      <version>0.0.4</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
Code:
package com.pb3.mongos3.wd
import sys.process._
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import com.mongodb.spark._
import org.apache.spark.sql.hive.HiveContext
import com.mongodb.spark.MongoConnector
import com.mongodb.spark.config.WriteConfig
import com.mongodb.casbah.MongoCollection
//import com.mongodb.casbah.MongoConnection
import com.mongodb.casbah.Imports._
import com.mongodb.casbah.gridfs.Imports._
import com.databricks.dbutils_v1.DBUtilsHolder.dbutils
import org.bson.Document
import org.bson.Document._
import org.bson._
import org.bson.types.ObjectId
import jdk.nashorn.internal.ir.annotations.Ignore
import com.amazonaws.auth.AWSStaticCredentialsProvider;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.regions.Regions;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectInputStream;
object transform1 {

  val sparksessionobject = SparkSession.builder()
    .master("local")
    .appName("MongoSparkConnectorIntro")
    .config("spark.mongodb.auth.uri", "mongodb://127.0.0.1:27017/admin")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/admin")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1:27017/admin")
    .config("spark.mongodb.input.database", "admin")
    .config("spark.mongodb.input.collection", "collectionNAME")
    .config("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.access.key", "IWJRFIRWJFIWJRWIJRR")
    .config("spark.hadoop.fs.s3a.secret.key", "JjjoijIJnjonhijijOIJJjijJIjjjjjllLL")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")
    .enableHiveSupport()
    .getOrCreate()

  sparksessionobject.conf.set("spark.sql.shuffle.partitions", 6)
  sparksessionobject.conf.set("spark.executor.memory", "2g")
  sparksessionobject.conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.7")

  def main(args: Array[String]): Unit = {
    val sc = sparksessionobject
    import com.mongodb.spark._
    import com.mongodb.spark.config._
    import org.bson.Document

    val readConfig = ReadConfig(Map("collection" -> "collectionNAME", "readPreference.name" -> "secondaryPreferred"), Some(ReadConfig(sc)))
    val customRdd = MongoSpark.load(sparksessionobject, readConfig)
    println("COUNT IS " + customRdd.count)

    val accessKeyId = "IWJRFIRWJFIWJRWIJRR"
    val secretAccessKey = "JjjoijIJnjonhijijOIJJjijJIjjjjjllLL"
    println("accessKeyId = " + accessKeyId)
    println("secretAccessKey = " + secretAccessKey)

    // var awsCreds = new BasicAWSCredentials(accessKeyId, secretAccessKey);
    // var s3 = AmazonS3Client.builder().withCredentials(new AWSStaticCredentialsProvider(awsCreds)).withRegion(Regions.AP_SOUTH_1).build();

    sparksessionobject.sparkContext.hadoopConfiguration.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    sparksessionobject.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", "accessKeyId")
    sparksessionobject.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", "secretAccessKey")
    sparksessionobject.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")
    sparksessionobject.sparkContext.hadoopConfiguration.set("com.amazonaws.services.s3.enableV4", "true")

    // val input = sparksessionobject.read.format("csv").option("header", "true").load(s"s3://$accessKeyId:$secretAccessKey@mongodbloger/temp.csv")
    val input = sparksessionobject.read.format("csv").option("header", "true").load(s"s3a://mongobucket/temp.csv")
    println(input)
    // println(customRdd)
    // val destination = s"s3://$s3BucketName"
    // println("Data will be written to: " + destination)
    // println(customRdd)
    // customRdd.write.parquet(destination)
    // customRdd.write.parquet("/home/user/Public/data")

    sc.stop
  }
}
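For completeness, the write I ultimately want to run is the one still commented out above; a minimal sketch of it would be the following (the "output/" prefix under mongobucket is a placeholder of mine):

// Hypothetical final step: persist the loaded collection as Parquet on S3.
customRdd.write
  .mode("overwrite")
  .parquet("s3a://mongobucket/output/")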
Output:
....
19/12/20 13:27:47 WARN FileStreamSink: Error while looking for metadata directory.
Exception in thread "main" com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 400, AWS Service: Amazon S3, AWS Request ID: 0AA41B7727ED198F, AWS Error Code: null, AWS Error Message: Bad Request, S3 Extended Request ID: NqqFwJXld6ZHNCo/FEMs6vK3g3yU82fl0ilc3xpgPQitaNQ3sfA0pam7r9jRr60moKOQmcWia5A=
at com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:798)
at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:421)
at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:232)
at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3528)
at com.amazonaws.services.s3.AmazonS3Client.headBucket(AmazonS3Client.java:1031)
at com.amazonaws.services.s3.AmazonS3Client.doesBucketExist(AmazonS3Client.java:994)
at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:297)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2669)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:547)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at com.pb3.mongos3.wd.transform1$.main(transform1.scala:103)
at com.pb3.mongos3.wd.transform1.main(transform1.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:845)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
19/12/20 13:27:47 INFO SparkContext: Invoking stop() from shutdown hook
19/12/20 13:27:47 INFO MongoClientCache: Closing MongoClient: [127.0.0.1:27017]
19/12/20 13:27:47 INFO connection: Closed connection [connectionId{localValue:2, serverValue:4}] to 127.0.0.1:27017 because the pool has been closed.
19/12/20 13:27:47 INFO SparkUI: Stopped Spark web UI at http://192.168.43.153:4040
19/12/20 13:27:47 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
19/12/20 13:27:47 INFO MemoryStore: MemoryStore cleared
19/12/20 13:27:47 INFO BlockManager: BlockManager stopped
19/12/20 13:27:47 INFO BlockManagerMaster: BlockManagerMaster stopped
19/12/20 13:27:47 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
19/12/20 13:27:47 INFO SparkContext: Successfully stopped SparkContext
19/12/20 13:27:47 INFO ShutdownHookManager: Shutdown hook called
19/12/20 13:27:47 INFO ShutdownHookManager: Deleting directory /tmp/spark-bc23cf8b-44c8-4698-a90d-7d00c3ce217c
19/12/20 13:27:47 INFO ShutdownHookManager: Deleting directory /tmp/spark-f5d03c19-946a-4188-92df-caa67fa46805
root@user:/opt/spark2#