Spark code to fetch tables from a database

Time: 2017-08-01 11:33:51

Tags: mysql scala apache-spark dataframe spark-dataframe

I am new to Spark and Scala. The problem is that I want to fetch all the data from every table in a database. I have written code that can fetch a single, specific table.

But can I just pass my database name and fetch the data from all of its tables?

Example:

import java.util.Properties

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

val driver = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://127.0.0.1:3306/mydb"
val username = "root"
val password = "*******"


val prop=new Properties()
val conf=new SparkConf().setAppName("Read From SQl").setMaster("local[*]")
val sc=new SparkContext(conf)
val sqlContext=new SQLContext(sc)

prop.setProperty("user", "root")
prop.setProperty("password", "lasitpant")

val df: org.apache.spark.sql.DataFrame = sqlContext.read.jdbc(url, "store_details", prop)

df.createOrReplaceTempView("store_details")
df.schema.printTreeString()

The above code only fetches that one specific table.


Basically, I want to pull data out of MySQL and then do some filtering with Spark SQL. I am just wondering whether there is some other way around this, instead of hard-coding a DataFrame for every table.

1 Answer:

Answer 0 (score: 1)


information_schema.tables is the way to load the list of tables, as explained in the sample program below. You can try the following approach, starting from this snippet:

val dfTableList = loadTable(url
      , "information_schema.tables"
      , user
      , pass);
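
As a side note on this approach: if you are on the question's setup, the schema filter can also be pushed down to MySQL by passing a subquery as the table name. This is a hedged sketch reusing the sqlContext, url and prop from the question; the alias t is required by MySQL:

// Sketch: let MySQL filter information_schema.tables before Spark reads it.
// Assumes sqlContext, url and prop are set up as in the question.
val tableNames = sqlContext.read
  .jdbc(url, "(SELECT table_name FROM information_schema.tables " +
             "WHERE table_schema = 'mydb') AS t", prop)
  .collect()
  .map(_.getString(0))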

Basic steps:

1) First get the list of all tables.
2) Map over each table to get a DataFrame, as in the snippet below.

tableList.map { tableName =>
        val df = loadTable(url
          , schemeName+"."+tableName
          , user
          , pass
          , driverName)

        if (df.isDefined){
          // do whatever you want to do with df.get (see the filtering sketch below)

           df.get.unpersist() //remove from cache
        }
      }
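
For the filtering the question asks about, the "do whatever you want" step could, for example, register each loaded DataFrame and run Spark SQL over it. This is a hypothetical sketch; the predicate is a placeholder:

// Hypothetical body for the "do whatever you want" step (Spark 1.6 API):
// register the table and filter it with Spark SQL.
val table = df.get
table.registerTempTable(tableName)
val filtered = table.sqlContext
  .sql(s"SELECT * FROM $tableName WHERE 1 = 1") // replace 1 = 1 with your filter
filtered.show()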

Note: I am using Spark 1.6, and it seems you are on something newer than 1.6, so some of the syntax may differ, but the approach is the same.
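
For reference, a rough Spark 2.x sketch of the same idea, assuming the url and prop from the question and the schema name 'mydb' from its JDBC url: SparkSession replaces SQLContext, and createOrReplaceTempView replaces registerTempTable.

import org.apache.spark.sql.SparkSession

// Rough Spark 2.x equivalent (sketch only).
val spark = SparkSession.builder()
  .appName("Read From SQl")
  .master("local[*]")
  .getOrCreate()

val tablesDf = spark.read.jdbc(url, "information_schema.tables", prop)
tablesDf.createOrReplaceTempView("all_tables")
val tableNames = spark.sql("SELECT table_name FROM all_tables WHERE table_schema = 'mydb'")
  .collect()
  .map(_.getString(0))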


The final code sample looks like the following. In the example below I save everything to Parquet files; you can ignore that part and implement your own custom logic...

package yourpackage

import org.apache.spark.sql.{SQLContext,DataFrame}

import java.sql.{Connection, Statement, ResultSet}
import java.util.TimeZone
// add other imports like loggers etc....

/**
 * Allows access to a JDBC database scheme and its tables, and implements methods
 * to export schemes and tables to Parquet format
 */
object JdbcExporter {

  /** Current SQLContext */
  private var sqlContext: SQLContext = null;

  /** Stores whether the JDBC driver has been loaded */
  private var JDBC_driver_loaded = false;

  /** Name of the JDBC driver */
  private val DefaultJDBC_DriverName = "com.mysql.jdbc.Driver"

  /** Time zone used to access to JDBC database */
  private val TimeZoneName = java.util.TimeZone.getDefault.getID


  /** Sets the current SQLContext
   * @param _sqlContext Current SQLContext
   */  
  def setSQL_Context(_sqlContext: SQLContext) =
      sqlContext = _sqlContext


  /** Builds a JDBC url to access a database using a starting url, user and password.
   *  It fixes the server time zone to 'java.util.TimeZone.getDefault.getID' to avoid
   *  problems when accessing time values
   * @param basicURL Starting url
   * @param user Name of the JDBC database user
   * @param pass Password of the JDBC database user
   * @return A new JDBC url with user and password 
   */      
  private def buildURL(basicURL: String
    , user: String
    , pass: String): String = {

    s"$basicURL?user=$user&password=$pass&useLegacyDatetimeCode=false&serverTimezone=$TimeZoneName"
  }


  /** Loads a JDBC driver
   * @param driverName Name of the driver
   * @return true if the driver has been properly loaded,
              or false otherwise
   */  
  def loadDriver(driverName: String) : Boolean = {

    logInfo(s"Loading JDBC driver '$driverName'")

    try{
      Class.forName(driverName).newInstance
      logInfo(s"JDBC driver '$driverName' loaded sucessfully")
      JDBC_driver_loaded = true;
      true
    }
    catch{
      case e: Exception  => {
        logError(e,s"Error loading JDBC driver '$driverName'. ")
        JDBC_driver_loaded = false;
        false
      }
    }
  }


  /** Gets a table from a JDBC database and creates a dataframe with its content
   * @param url JDBC url
   * @param tableName Name of the JDBC database table
   * @param user Name of the JDBC database user
   * @param pass Password of the JDBC database user
   * @param driverName Name of the JDBC driver
   * @return Some(dataframe) if it has been properly created,
             or None otherwise
   */
  def loadTable(url: String
    , tableName: String
    , user: String
    , pass: String
    , driverName: String = DefaultJDBC_DriverName) : Option[DataFrame] = {

    logInfo(s"Loading table '$tableName' from '$url'")

    //check if driver is loaded
    if (!JDBC_driver_loaded){
      if (!loadDriver(driverName))
        return None
    }

    try {
      val prop = new java.util.Properties
      prop.setProperty("driver", driverName)

      val df = sqlContext.read.jdbc(buildURL(url, user, pass)
        , tableName
        , prop) //connection properties

      Some(df)
    }
    catch {
      case e: Exception =>
        logError(e, s"Error loading table from '$url'")
        None
    }
  }


  /** Gets the name list of the JDBC database tables stored in a JDBC scheme 
   * @param url JDBC url
   * @param schemeName Name of the JDBC database scheme
   * @param user Name of the JDBC database user
   * @param pass Password of the JDBC database user
   * @param driverName Name of the JDBC driver
   * @return Name list of the tables of the scheme,
             or None in case of error or an empty scheme
   */
  def getTableListFromScheme(url: String
    , schemeName: String
    , user: String
    , pass: String
    , driverName: String = DefaultJDBC_DriverName): Option[Array[String]] = {

    val dfTableList = loadTable(url
      , "information_schema.tables"
      , user
      , pass
      , driverName);

    if (!dfTableList.isDefined) return None

    val df = dfTableList.get
    val temporalTable = "tableNameList"
    val query = s"select table_name from $temporalTable where table_schema='$schemeName'"
    df.registerTempTable(temporalTable)         //associate a temporal table to the loaded table
    var dfSQL = df.sqlContext.sql(query)        //execute query on temporal table
    scala.util.Try(sqlContext.dropTempTable(temporalTable)) //delete temporal table
    Some(dfSQL.rdd.map(r => r(0).asInstanceOf[String]).collect)  //get the result
  }


  /** Saves a JDBC database table into Parquet format, deleting previous content in the output 
   * @param schemeName Name of the scheme of the table 
   * @param tableName Name of the table
   * @param df Data of the table
   * @param rootPath Path to store the formatted data
   */
  def saveFileAsParquet(schemeName: String
    , tableName: String
    , df: DataFrame
    , rootPath: String) {
    // ExportUtil is a custom class for interacting with HDFS and files; implement your own version
    // DFUtil is another helper class where you can keep reusable DataFrame methods
      val path = ExportUtil.ensureEndWithFileSeparator(rootPath)+tableName
      ExportUtil.deleteDirectory(path)
      ExportUtil.ensureDirectoryExist(path)

      DFUtil.saveDataframeAsParquet(sqlContext
        , df
        , path)
  }


  /** Saves a complete JDBC database scheme into Parquet format
   * @param url JDBC url
   * @param schemeName Name of the scheme 
   * @param user Name of the JDBC database user
   * @param pass Password of the JDBC database user 
   * @param rootPath Path to store the formatted data
   * @param driverName Name of the JDBC driver
   */
  def saveSchemeAsParquet(url: String
    , schemeName: String
    , user: String
    , pass: String
    , rootPath: String
    , driverName: String = DefaultJDBC_DriverName) : Boolean = {

    logInfo(s"Converting into Parquet format scheme: '$schemeName' into root path: '$rootPath'")

    val tableList = getTableListFromScheme(url
      , schemeName
      , user
      , pass
      , driverName)

     if (!tableList.isDefined) return false

    saveTableListAsParquet(url
      , schemeName
      , tableList.get
      , user
      , pass
      , rootPath
      , driverName)

    logInfo(s"Converted into Parquet format scheme: '$schemeName' into root path: '$rootPath'")
  }


  /** Saves a JDBC database table name list into Parquet format
   * @param url JDBC url 
   * @param schemeName Name of the scheme 
   * @param tableList Table name list
   * @param user Name of the JDBC database user
   * @param pass Password of the JDBC database user 
   * @param rootPath Path to store the formatted data
   * @param driverName Name of the JDBC driver
   */
  def saveTableListAsParquet(url: String
    , schemeName: String
    , tableList: Seq[String]
    , user: String
    , pass: String
    , rootPath: String
    , driverName: String = DefaultJDBC_DriverName) : Boolean = {

     val tableListAsString = tableList.mkString(",")
     val count = tableList.size
     logInfo(s"Converting into Parquet a list of $count tables: $tableListAsString into root path: '$rootPath'")

     tableList.map { tableName =>
        val df = loadTable(url
          , schemeName+"."+tableName
          , user
          , pass
          , driverName)

        if (df.isDefined){
          saveFileAsParquet(schemeName
           , tableName
           , df.get
           , ExportUtil.ensureEndWithFileSeparator(rootPath)+schemeName)

           df.get.unpersist() //remove from cache
        }
      }
      logInfo(s"Converted into Parquet a list of $count tables: $tableListAsString into root path: '$rootPath'")
  } 
}
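
Putting it together, a minimal driver sketch might look like this. The connection values and output path below are placeholders, and ExportUtil / DFUtil are the helper classes mentioned in the comments above, which you would implement yourself:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Hypothetical usage of JdbcExporter; all values below are placeholders.
val conf = new SparkConf().setAppName("ExportMySQLSchema").setMaster("local[*]")
val sc = new SparkContext(conf)
JdbcExporter.setSQL_Context(new SQLContext(sc))

JdbcExporter.saveSchemeAsParquet(
  url = "jdbc:mysql://127.0.0.1:3306"
  , schemeName = "mydb"
  , user = "root"
  , pass = "*******"
  , rootPath = "/tmp/mydb_parquet")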