No suitable driver exception while working on a Spark-JDBC program

Date: 2018-07-31 11:19:12

Tags: scala apache-spark apache-spark-sql

I am trying to read a table that resides on a Postgres DB using spark-jdbc. To do that, I came up with the following code:

import java.io.FileInputStream
import java.util.Properties

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object PartitionRetrieval {
  var conf  = new SparkConf().setAppName("Spark-JDBC").set("spark.executor.heartbeatInterval","120s")
                                                      .set("spark.network.timeout","12000s")
                                                      .set("spark.default.parallelism", "20")
  val conFile       = "testconnection.properties"
  val properties    = new Properties()
  properties.load(new FileInputStream(conFile))
  val connectionUrl = properties.getProperty("gpDevUrl")
  val devUserName   = properties.getProperty("devUserName")
  val devPassword   = properties.getProperty("devPassword")
  val driverClass   = properties.getProperty("gpDriverClass")
  val hiveMetaConURL  = properties.getProperty("hiveMetaDevUrl")
  val metaUserName    = properties.getProperty("hiveMetaDevUser")
  val metaPassword    = properties.getProperty("hiveMetaDevpassword")
  try {
    Class.forName(driverClass).newInstance()
  } catch {
    case cnf: ClassNotFoundException =>
      System.exit(1)
    case e: Exception =>
      System.exit(1)
  }
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().config(conf).master("yarn").enableHiveSupport().getOrCreate()
    import spark.implicits._
    val yearDF = spark.read.format("jdbc").option("url", connectionUrl)
                                                   .option("dbtable", "(select * from src.forecaste where source_system_name='ORACLE' and year='2017') as year2017")
                                                   .option("user", devUserName)
                                                   .option("password", devPassword)
                                                   .option("numPartitions",15)
                                                   .load()
    val dtypes = spark.read.format("jdbc").option("url", hiveMetaConURL)
                                                   .option("dbtable", "(select source_type, hive_type from metadisc.types) as hiveDataTypes")
                                                   .option("user", metaUserName)
                                                   .option("password", metaPassword)
                                                   .load()
    val sourceCols = spark.read.format("jdbc").option("url", hiveMetaConURL)
                                                   .option("dbtable", "(select source_columns from metadisc.metatables where tablename='base.forecaste') as sCols")
                                                   .option("user", metaUserName)
                                                   .option("password", metaPassword)
                                                   .load()
    val yearcount = yearDF.count()
    println("row count: " + yearcount)
    val dataMapper = dtypes.as[(String,String)].collect().toMap
    val sourceColDataTypes:String = sourceCols.rdd.map(_.toSeq.mkString(",")).collect.mkString(",")
    val cleanDataTypes = sourceColDataTypes.split("\\|")
    println(sourceCols)
    println(cleanDataTypes)
  }
}
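
For context, the DriverManager.getDriver call shown in the stack trace further down is only reached when no explicit "driver" option is passed to the JDBC reader; Spark's JDBC source also accepts a "driver" option that names the class directly. A minimal sketch of the same yearDF read with that option added, purely as an illustration (it reuses the driverClass value already loaded from the properties file):

val yearDF = spark.read.format("jdbc")
  .option("url", connectionUrl)
  .option("driver", driverClass)  // e.g. org.postgresql.Driver, taken from gpDriverClass
  .option("dbtable", "(select * from src.forecaste where source_system_name='ORACLE' and year='2017') as year2017")
  .option("user", devUserName)
  .option("password", devPassword)
  .option("numPartitions", 15)
  .load()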

When I submit the jar using spark-submit as shown below:

SPARK_MAJOR_VERSION=2 spark-submit --driver-class-path /home/defusr/jars/postgresql-42.1.4.jar --jars /home/defusr/jars/postgresql-42.1.4.jar --master=yarn --deploy-mode=cluster --keytab /home/defusr/defusr.keytab --principal defusr@DEV.COM --files /usr/hdp/current/spark2-client/conf/hive-site.xml,testconnection.properties --name ABCD --class com.yearpartition.obj.PartitionRetrieval yearpartition_2.11-0.1.jar
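
As a sanity check, the settings those flags translate into can be printed from inside main() once the SparkSession exists; --driver-class-path maps to spark.driver.extraClassPath, and --jars is usually recorded under spark.jars (on YARN it may appear under spark.yarn.dist.jars instead). A small diagnostic sketch, not part of the submitted program:

// Diagnostic only: assumes it runs inside main() after the SparkSession is created.
val sc = spark.sparkContext
println("spark.driver.extraClassPath = " + sc.getConf.getOption("spark.driver.extraClassPath"))
println("spark.jars = " + sc.getConf.getOption("spark.jars"))
println("spark.yarn.dist.jars = " + sc.getConf.getOption("spark.yarn.dist.jars"))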

It says "No suitable driver" at the yearDF line. My connection-properties file, which sits in a directory on Linux, is:

devUserName=dusr
devPassword=password
gpDriverClass=org.postgresql.Driver
gpDevUrl=jdbc:postgresql://xx.xxx.xxx.xxx:1234/base?ssl=true&sslfactory=org.postgresql.ssl.NonValidatingFactory
hiveMetaDevUrl=jdbc:postgresql://xxxxx.xxxxxxxxxxxx.xx-xxxx-x.xxx.xxxxxxxxx.com:1234/schemameta?currentSchema=metadisc
hiveMetaDevUser=metausr
hiveMetaDevpassword=password
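
Note that testconnection.properties is opened by its bare relative name, while the file itself is distributed with --files. A small fallback sketch that also looks the file up through SparkFiles, shown only as an illustration (it assumes it is called after the SparkContext has been created, unlike the top-level load in the code above):

import java.io.{File, FileInputStream}
import java.util.Properties
import org.apache.spark.SparkFiles

// Load the properties from the local working directory if the file is there,
// otherwise from the location Spark copied it to via --files.
def loadProps(fileName: String): Properties = {
  val path = if (new File(fileName).exists()) fileName else SparkFiles.get(fileName)
  val props = new Properties()
  val in = new FileInputStream(path)
  try props.load(in) finally in.close()
  props
}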

Below is the exception message from the logs:

java.sql.SQLException: No suitable driver
    at java.sql.DriverManager.getDriver(DriverManager.java:315)
    at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
    at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$7.apply(JDBCOptions.scala:84)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:83)
    at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:34)
    at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:34)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:309)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
    at com.yearpartition.obj.PartitionRetrieval$.main(PartitionRetrieval.scala:52)
    at com.yearpartition.obj.PartitionRetrieval.main(PartitionRetrieval.scala)

It occurs at the following line: yearDF
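
Since the failure comes from DriverManager.getDriver, one way to see which JDBC drivers the driver JVM has actually registered at that point is to enumerate them just before the yearDF read (a diagnostic sketch, not part of the original program):

import java.sql.DriverManager
import scala.collection.JavaConverters._

// Print every JDBC driver currently registered with DriverManager in this JVM;
// org.postgresql.Driver should show up here if postgresql-42.1.4.jar is on the
// driver's classpath and the class has been loaded.
DriverManager.getDrivers.asScala.foreach(d => println(d.getClass.getName))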

I faced the same error earlier, when no properties file was involved, and after some research I re-arranged the parameters in spark-submit and it started running, specifically once the parameters --driver-class-path and --jars were kept at the beginning. Could anyone let me know what mistake I am making here so that I can correct it?

0 Answers:

No answers yet.