Transferring MongoDB data to an AWS S3 bucket with Apache Spark fails

Asked: 2019-12-20 08:06:56

Tags: amazon-web-services apache-spark amazon-s3

I am new to AWS and am trying to read the documents of a MongoDB collection and store them in an S3 bucket in Parquet format, using Scala together with Apache Spark. I have been trying for many days, following every set of steps I could find on the Internet, but I cannot get even this simple MongoDB collection transferred to the S3 bucket as Parquet. Any help would be greatly appreciated.

spark-submit command:

bin/spark-submit --packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.7,org.apache.hadoop:hadoop-aws:2.7.7,com.amazonaws:aws-java-sdk-pom:1.11.538 --class com.pb3.mongos3.wd.transform1 /home/user/Desktop/pbjar03.jar mongodb://127.0.0.1/admin.collectionNAME IWJRFIRWJFIWJRWIJRR:JjjoijIJnjonhijijOIJJjijJIjjjjjllLL@mongobucket/
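
For reference, the last argument of this command embeds the access key and secret key directly in the bucket URI. A sketch only, not a verified fix: the same credentials could instead be supplied through --conf spark.hadoop.fs.s3a.* settings, which Spark copies into the Hadoop configuration. <ACCESS_KEY> and <SECRET_KEY> are placeholders, and the plain s3a://mongobucket/ form of the last positional argument is an assumption about how the jar consumes it.

bin/spark-submit \
  --packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.7,org.apache.hadoop:hadoop-aws:2.7.7,com.amazonaws:aws-java-sdk-pom:1.11.538 \
  --conf spark.hadoop.fs.s3a.access.key=<ACCESS_KEY> \
  --conf spark.hadoop.fs.s3a.secret.key=<SECRET_KEY> \
  --conf spark.hadoop.fs.s3a.endpoint=s3.ap-south-1.amazonaws.com \
  --class com.pb3.mongos3.wd.transform1 \
  /home/user/Desktop/pbjar03.jar mongodb://127.0.0.1/admin.collectionNAME s3a://mongobucket/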

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.pb3.mongos3.wd</groupId>
  <artifactId>test</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>mongos3</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>


  <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.2.1</version>
</dependency>

  <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.2.1</version>
</dependency>

  <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive-thriftserver -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive-thriftserver_2.11</artifactId>
    <version>2.2.1</version>
    <scope>provided</scope>
</dependency>


  <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-yarn -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-yarn_2.11</artifactId>
    <version>2.2.1</version>
</dependency>


  <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-catalyst -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-catalyst_2.11</artifactId>
    <version>2.2.1</version>
    <scope>test</scope>
</dependency>


  <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.11</artifactId>
    <version>2.2.1</version>
    <scope>provided</scope>
</dependency>


  <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.2.1</version>
    <scope>provided</scope>
</dependency>


<!-- https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector -->
<dependency>
    <groupId>org.mongodb.spark</groupId>
    <artifactId>mongo-spark-connector_2.11</artifactId>
    <version>2.2.7</version>
</dependency>

    <dependency>
        <groupId>org.mongodb</groupId>
        <artifactId>mongodb-driver-sync</artifactId>
        <version>3.11.2</version>
    </dependency>

    <dependency>
        <groupId>org.mongodb.scala</groupId>
        <artifactId>mongo-scala-driver_2.11</artifactId>
        <version>2.4.0</version>
    </dependency>
<!-- https://mvnrepository.com/artifact/org.mongodb/casbah -->
<dependency>
    <groupId>org.mongodb</groupId>
    <artifactId>casbah_2.11</artifactId>
    <version>3.1.1</version>
    <type>pom</type>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-aws</artifactId>
    <version>2.10.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk -->
<dependency>
    <groupId>com.amazonaws</groupId>
    <artifactId>aws-java-sdk</artifactId>
    <version>1.11.688</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-s3 -->
<dependency>
    <groupId>com.amazonaws</groupId>
    <artifactId>aws-java-sdk-s3</artifactId>
    <version>1.11.688</version>
</dependency>


<!-- https://mvnrepository.com/artifact/org.scala-lang/scala-library -->
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>2.11.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.databricks/dbutils-api -->
<dependency>
    <groupId>com.databricks</groupId>
    <artifactId>dbutils-api_2.11</artifactId>
    <version>0.0.4</version>
</dependency>


   <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>

Code:

package com.pb3.mongos3.wd

import sys.process._
import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import com.mongodb.spark._
import org.apache.spark.sql.hive.HiveContext

import com.mongodb.spark.MongoConnector
import com.mongodb.spark.config.WriteConfig

import com.mongodb.casbah.MongoCollection
//import com.mongodb.casbah.MongoConnection
import com.mongodb.casbah.Imports._
import com.mongodb.casbah.gridfs.Imports._


import com.databricks.dbutils_v1.DBUtilsHolder.dbutils
import org.bson.Document
import org.bson.Document._
import org.bson._
import org.bson.types.ObjectId
import jdk.nashorn.internal.ir.annotations.Ignore
import com.amazonaws.auth.AWSStaticCredentialsProvider;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.regions.Regions;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectInputStream;

object transform1  {
    val sparksessionobject = SparkSession.builder()
      .master("local")
      .appName("MongoSparkConnectorIntro")

      .config("spark.mongodb.auth.uri","mongodb://127.0.0.1:27017/admin")
      .config("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/admin")
      .config("spark.mongodb.output.uri", "mongodb://127.0.0.1:27017/admin")
      .config("spark.mongodb.input.database", "admin")
      .config("spark.mongodb.input.collection", "collectionNAME")

      .config("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
      .config("spark.hadoop.fs.s3a.access.key", "IWJRFIRWJFIWJRWIJRR")
      .config("spark.hadoop.fs.s3a.secret.key", "JjjoijIJnjonhijijOIJJjijJIjjjjjllLL")
      .config("spark.hadoop.fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")
      .enableHiveSupport()
      .getOrCreate()

    sparksessionobject.conf.set("spark.sql.shuffle.partitions", 6)
    sparksessionobject.conf.set("spark.executor.memory", "2g")
    sparksessionobject.conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.7")

  def main(args: Array[String]): Unit = {

    val sc = sparksessionobject

    import com.mongodb.spark._
    import com.mongodb.spark.config._
    import org.bson.Document


    val readConfig = ReadConfig(Map("collection" -> "collectionNAME", "readPreference.name" -> "secondaryPreferred"), Some(ReadConfig(sc)))
    val customRdd = MongoSpark.load(sparksessionobject, readConfig)

    println("COUNT IS " + customRdd.count)
    val accessKeyId = "IWJRFIRWJFIWJRWIJRR"
    val secretAccessKey = "JjjoijIJnjonhijijOIJJjijJIjjjjjllLL"
    println("accessKeyId = " + accessKeyId)
    println("secretAccessKey = " + secretAccessKey)
//    var awsCreds = new BasicAWSCredentials(accessKeyId, secretAccessKey);
//    var s3 = AmazonS3Client.builder().withCredentials(new AWSStaticCredentialsProvider(awsCreds)).withRegion(Regions.AP_SOUTH_1).build();

    sparksessionobject.sparkContext.hadoopConfiguration.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    sparksessionobject.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", accessKeyId)
    sparksessionobject.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", secretAccessKey)
    sparksessionobject.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")
    sparksessionobject.sparkContext.hadoopConfiguration.set("com.amazonaws.services.s3.enableV4", "true")
//    val input = sparksessionobject.read.format("csv").option("header", "true").load(s"s3://$accessKeyId:$secretAccessKey@mongodbloger/temp.csv")

    val input = sparksessionobject.read.format("csv").option("header", "true").load(s"s3a://mongobucket/temp.csv")

    println(input)

//      println(customRdd)
//      val destination = s"s3://$s3BucketName"
//      println("Data will be written to: " + destination)
//      println(customRdd)
//      customRdd.write.parquet(destination)
//      customRdd.write.parquet("/home/user/Public/data")

    sc.stop
  }

}
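
The Parquet write itself is commented out in the code above. For reference only, a minimal sketch of the intended MongoDB-to-Parquet-on-S3 flow might look like the following. This is an assumption, not a verified fix: the object name TransformSketch, the output prefix output/, and the use of the standard AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables are placeholders, while the bucket, database, collection, and endpoint come from the question.

package com.pb3.mongos3.wd

import com.mongodb.spark.MongoSpark
import com.mongodb.spark.config.ReadConfig
import org.apache.spark.sql.SparkSession

object TransformSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local")
      .appName("MongoToS3ParquetSketch")
      // Database and collection taken from the question.
      .config("spark.mongodb.input.uri", "mongodb://127.0.0.1:27017/admin.collectionNAME")
      .getOrCreate()

    // S3A settings go on the Hadoop configuration; the key values come from the
    // standard AWS environment variables instead of being hard-coded.
    val hadoopConf = spark.sparkContext.hadoopConfiguration
    hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoopConf.set("fs.s3a.access.key", sys.env("AWS_ACCESS_KEY_ID"))
    hadoopConf.set("fs.s3a.secret.key", sys.env("AWS_SECRET_ACCESS_KEY"))
    hadoopConf.set("fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")

    // ap-south-1 accepts only Signature Version 4 requests; with hadoop-aws 2.7.x
    // this is switched on via a JVM system property (sufficient here for local mode),
    // not via a Hadoop configuration key.
    System.setProperty("com.amazonaws.services.s3.enableV4", "true")

    // Load the collection as a DataFrame and write it out as Parquet.
    val readConfig = ReadConfig(Map("collection" -> "collectionNAME"), Some(ReadConfig(spark)))
    val df = MongoSpark.load(spark, readConfig)
    df.write.mode("overwrite").parquet("s3a://mongobucket/output/") // "output/" is a placeholder prefix

    spark.stop()
  }
}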

Output:

....
19/12/20 13:27:47 WARN FileStreamSink: Error while looking for metadata directory.
Exception in thread "main" com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 400, AWS Service: Amazon S3, AWS Request ID: 0AA41B7727ED198F, AWS Error Code: null, AWS Error Message: Bad Request, S3 Extended Request ID: NqqFwJXld6ZHNCo/FEMs6vK3g3yU82fl0ilc3xpgPQitaNQ3sfA0pam7r9jRr60moKOQmcWia5A=
    at com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:798)
    at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:421)
    at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:232)
    at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3528)
    at com.amazonaws.services.s3.AmazonS3Client.headBucket(AmazonS3Client.java:1031)
    at com.amazonaws.services.s3.AmazonS3Client.doesBucketExist(AmazonS3Client.java:994)
    at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:297)
    at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2669)
    at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
    at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
    at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
    at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
    at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:547)
    at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
    at scala.collection.immutable.List.flatMap(List.scala:355)
    at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
    at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
    at com.pb3.mongos3.wd.transform1$.main(transform1.scala:103)
    at com.pb3.mongos3.wd.transform1.main(transform1.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
    at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:845)
    at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
    at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
    at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
    at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
19/12/20 13:27:47 INFO SparkContext: Invoking stop() from shutdown hook
19/12/20 13:27:47 INFO MongoClientCache: Closing MongoClient: [127.0.0.1:27017]
19/12/20 13:27:47 INFO connection: Closed connection [connectionId{localValue:2, serverValue:4}] to 127.0.0.1:27017 because the pool has been closed.
19/12/20 13:27:47 INFO SparkUI: Stopped Spark web UI at http://192.168.43.153:4040
19/12/20 13:27:47 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
19/12/20 13:27:47 INFO MemoryStore: MemoryStore cleared
19/12/20 13:27:47 INFO BlockManager: BlockManager stopped
19/12/20 13:27:47 INFO BlockManagerMaster: BlockManagerMaster stopped
19/12/20 13:27:47 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
19/12/20 13:27:47 INFO SparkContext: Successfully stopped SparkContext
19/12/20 13:27:47 INFO ShutdownHookManager: Shutdown hook called
19/12/20 13:27:47 INFO ShutdownHookManager: Deleting directory /tmp/spark-bc23cf8b-44c8-4698-a90d-7d00c3ce217c
19/12/20 13:27:47 INFO ShutdownHookManager: Deleting directory /tmp/spark-f5d03c19-946a-4188-92df-caa67fa46805
root@user:/opt/spark2# 
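
For reference: a 400 Bad Request returned by headBucket against s3.ap-south-1.amazonaws.com is commonly attributed to that region accepting only Signature Version 4 requests, which hadoop-aws 2.7.x enables through the JVM system property com.amazonaws.services.s3.enableV4 rather than through hadoopConfiguration.set as attempted in the code above. A hedged sketch (these are standard Spark configuration keys, but whether this resolves this particular failure is an assumption) would add the following to the existing spark-submit command:

  --conf spark.driver.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4=true \
  --conf spark.executor.extraJavaOptions=-Dcom.amazonaws.services.s3.enableV4=true \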
