I am trying to read a file from a Google Cloud Storage bucket. I can read it from spark-shell when I launch the shell with the GCS connector jar included, but when I submit the job via spark-submit it throws the following error:
Exception in thread "main" java.lang.NoSuchMethodError: com.google.common.base.Splitter.splitToList(Ljava/lang/CharSequence;)Ljava/util/List;
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase$ParentTimestampUpdateIncludePredicate.create(GoogleHadoopFileSystemBase.java:780)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.createOptionsBuilderFromConfig(GoogleHadoopFileSystemBase.java:2130)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.configure(GoogleHadoopFileSystemBase.java:1822)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:1003)
at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.initialize(GoogleHadoopFileSystemBase.java:966)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2689)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2723)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2705)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:407)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:172)
at org.apache.hadoop.mapred.JobConf.getWorkingDirectory(JobConf.java:656)
at org.apache.hadoop.mapred.FileInputFormat.setInputPaths(FileInputFormat.java:440)
at org.apache.hadoop.mapred.FileInputFormat.setInputPaths(FileInputFormat.java:413)
at org.apache.spark.SparkContext$$anonfun$hadoopFile$1$$anonfun$33.apply(SparkContext.scala:1015)
at org.apache.spark.SparkContext$$anonfun$hadoopFile$1$$anonfun$33.apply(SparkContext.scala:1015)
at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:176)
at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:176)
at scala.Option.map(Option.scala:145)
at org.apache.spark.rdd.HadoopRDD.getJobConf(HadoopRDD.scala:176)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:195)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD.count(RDD.scala:1157)
at com.google.reader.GoogleRead$.main(GoogleRead.scala:41)
at com.google.reader.GoogleRead.main(GoogleRead.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:752)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
I tried excluding the Guava dependency, but I still get the above error. Below is the build.sbt file I am using:
import sbt.ExclusionRule

name := "GoogleFileReader"

version := "1.0"

scalaVersion := "2.10.4"

libraryDependencies += "org.apache.spark" % "spark-sql_2.10" % "1.6.1" exclude("com.google.guava", "guava")

libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "1.6.0-hadoop2"

assemblyMergeStrategy in assembly := {
  case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
  case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
  case PathList("org", "apache", xs @ _*) => MergeStrategy.last
  case PathList("com", "google", xs @ _*) => MergeStrategy.last
  case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
  case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
  case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
  case "about.html" => MergeStrategy.rename
  case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
  case "META-INF/mailcap" => MergeStrategy.last
  case "META-INF/mimetypes.default" => MergeStrategy.last
  case "plugin.properties" => MergeStrategy.last
  case "log4j.properties" => MergeStrategy.last
  case "pom.properties" => MergeStrategy.last
  case x =>
    val oldStrategy = (assemblyMergeStrategy in assembly).value
    oldStrategy(x)
}
Answer 0 (score: 1)
The Guava library actually used when running in the cluster is the older version that Hadoop depends on, which is provided when the driver/executor processes start. It appears first on the classpath, so the newer version your code depends on never gets loaded; hence the NoSuchMethodError.
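To confirm which Guava actually wins at runtime, here is a minimal diagnostic sketch (assuming Guava is on the classpath; the object name is just for illustration) that prints the jar the Splitter class was loaded from:

import com.google.common.base.Splitter

object GuavaProbe {
  def main(args: Array[String]): Unit = {
    // getCodeSource may be null for classes loaded by the bootstrap classloader
    val src = classOf[Splitter].getProtectionDomain.getCodeSource
    println(if (src == null) "bootstrap classloader" else src.getLocation.toString)
  }
}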
You can try the solution described here (http://jhz.name/2016/01/10/spark-classpath.html), which basically tells Spark to check the user classpath first and the Hadoop classpath second. It uses the following Spark configuration parameters:
spark.driver.userClassPathFirst=true
spark.executor.userClassPathFirst=true
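For example, on the command line (the jar path assumes sbt-assembly's default output for this build; the main class is the one from the stack trace above):

spark-submit \
  --class com.google.reader.GoogleRead \
  --conf spark.driver.userClassPathFirst=true \
  --conf spark.executor.userClassPathFirst=true \
  target/scala-2.10/GoogleFileReader-assembly-1.0.jar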
Most likely you will run into other kinds of trouble with this approach, as I did (java.lang.LinkageError, because of the class loaders involved in this case).
The solution that helped me was shading the Guava version my code uses so that it no longer conflicts with the version Hadoop depends on. See the response here: https://groups.google.com/a/lists.datastax.com/forum/#!topic/spark-connector-user/8GCd8iQPy3c
It consists of adding this shading rule to build.sbt:

assemblyShadeRules in assembly := Seq(
  ShadeRule.rename("com.google.**" -> "shaded.@1").inAll
)
Make sure you also update the merge strategy for com.google accordingly:
case PathList("shaded", xs @ _*) => MergeStrategy.last
Answer 1 (score: 0)
package com.murphy

import org.apache.spark.sql.SparkSession

object GoogleStorageReader {
  def main(args: Array[String]): Unit = {
    // Local SparkSession; the explicit warehouse dir is only needed on Windows
    val spark: SparkSession = SparkSession
      .builder()
      .appName("GCS test")
      .master("local")
      .config("spark.sql.warehouse.dir", "E:\\mobSummaryDataIngestion\\spark-warehouse")
      .getOrCreate()

    // Authenticate against GCS with a service-account JSON key file
    val jsonKeyFile = "E:\\ConnsKPI scripts\\gcstest\\gcstest\\google_key.json"
    spark.sparkContext.hadoopConfiguration.set("google.cloud.auth.service.account.json.keyfile", jsonKeyFile)

    // Read one sheet of an Excel workbook from a gs:// path via spark-excel
    val df = spark.sqlContext.read
      .format("com.crealytics.spark.excel")
      .option("sheetName", "Master 18")
      .option("useHeader", "true")
      .option("treatEmptyValuesAsNulls", "false")
      .option("inferSchema", "false")
      .option("startColumn", 0)
      .option("endColumn", 20)
      .option("skipFirstRows", 3)
      .load("gs://bucket-for-practice/MAXIS_CELCOM_IT_Masterlist-2018.xlsx")

    df.show()
  }
}
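If the gs:// scheme is not picked up in your environment, a possible addition inside main (before the read) is to register the connector classes explicitly; recent gcs-connector versions normally self-register via Hadoop's service loader, so this may be redundant:

// Optional: explicitly register the GCS connector filesystem classes
val hc = spark.sparkContext.hadoopConfiguration
hc.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hc.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")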
Please find the pom file below for the dependencies.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.murphy</groupId>
<artifactId>mobSummaryDataIngestion</artifactId>
<version>1.0-SNAPSHOT</version>
<name>mobSummaryDataIngestion</name>
<!-- FIXME change it to the project's website -->
<url>http://www.example.com</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>3.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>com.google.apis</groupId>
<artifactId>google-api-services-storage</artifactId>
<version>v1-rev158-1.25.0</version>
</dependency> <!-- this is a maven cloud storage json api-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-storage</artifactId>
<version>1.92.0</version>
</dependency>
<dependency>
<groupId>com.google.cloud.bigdataoss</groupId>
<artifactId>gcs-connector</artifactId>
<version>hadoop3-2.0.0</version>
</dependency>
<dependency>
<groupId>com.google.cloud.bigdataoss</groupId>
<artifactId>bigquery-connector</artifactId>
<version>hadoop3-1.0.0-RC2</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>27.1-jre</version>
</dependency>
<dependency>
<groupId>com.google.api-client</groupId>
<artifactId>google-api-client</artifactId>
<version>1.30.4</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_2.11</artifactId>
<version>2.7.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.crealytics/spark-excel -->
<dependency>
<groupId>com.crealytics</groupId>
<artifactId>spark-excel_2.11</artifactId>
<version>0.9.17</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.7.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.20.1</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>com.murphy.MobileDashboard</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<!-- see http://davidb.github.com/scala-maven-plugin -->
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<jvmArgs>
<jvmArg>-Xss4m</jvmArg>
<jvmArg>-Xms512m</jvmArg>
<jvmArg>-Xmx4096m</jvmArg>
</jvmArgs>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.4.1</version>
<configuration>
<!-- get all project dependencies -->
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<!-- MainClass in mainfest make a executable jar -->
<archive>
<manifest>
<mainClass>com.murphy.MobileDashboard</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<!-- bind to the packaging phase -->
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>