我刚接触 Spark 和 Scala。我想读取数据库中所有表的全部数据,目前已经写了一段只能读取某一张指定表的代码。
那么,能否只传入数据库名称,就从该数据库的所有表中读取数据呢?
示例:
// Connection settings are declared once here and reused below, so the
// credentials are not duplicated as literals further down.
val driver = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://127.0.0.1:3306/mydb"
val username = "root"
val password = "*******"

// Local Spark context plus the SQLContext used to read over JDBC.
val conf = new SparkConf().setAppName("Read From SQl").setMaster("local[*]")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)

// JDBC connection properties built from the settings declared above.
val prop = new Properties()
prop.setProperty("driver", driver)
prop.setProperty("user", username)
prop.setProperty("password", password)

// Loads the single table "store_details" and exposes it to Spark SQL under the same name.
val df: org.apache.spark.sql.DataFrame = sqlContext.read.jdbc(url, "store_details", prop)
df.createOrReplaceTempView("store_details")
df.schema.printTreeString()
以上代码只能读取一张指定的表。
基本上,我想从 MySQL 中读取数据,然后用 Spark SQL 做一些过滤。我想知道,除了为每张表硬编码创建一个 DataFrame 之外,还有没有别的办法?
答案 0 :(得分:1)
information_schema.tables
是获取表名列表的途径,下面的示例程序对此做了说明。你可以尝试如下做法,对应下面代码中的这一行:val dfTableList = loadTable(url , "information_schema.tables" , user , pass);
1)首先获取所有表的列表
2)遍历每张表,按下面的代码片段为其加载 DataFrame
// Visit every table of the scheme in turn. `foreach` is used because the loop is
// run only for its side effects; tables for which loadTable yields None are skipped.
tableList.foreach { tableName =>
  loadTable(url, schemeName + "." + tableName, user, pass, driverName).foreach { df =>
    // do what ever you want to do
    df.unpersist() // remove from cache
  }
}
注意:我使用的是 Spark 1.6,而你用的似乎是高于 1.6 的版本,因此部分语法可能有所不同,但思路是一样的。
完整的代码示例如下。在下面的例子中,我把数据保存成了 Parquet 文件;你可以忽略这一部分,实现你自己的自定义逻辑。
package yourpackage
import org.apache.spark.sql.{SQLContext,DataFrame}
import java.sql.{Connection, Statement, ResultSet}
import java.util.TimeZone
// add other imports like loggers etc....
/**
 * Gives access to the schemes and tables of a JDBC database and implements methods
 * to export schemes and tables to Parquet format.
 *
 * The SQLContext has to be supplied through `setSQL_Context` before any table is loaded.
 * `logInfo`, `logError`, `ExportUtil` and `DFUtil` are not defined in this object; they are
 * expected to be provided by the surrounding project.
 */
object JdbcExporter {

  /** Current SQLContext; remains null until `setSQL_Context` has been called. */
  private var sqlContext: SQLContext = null

  /** Names of the JDBC drivers that have been loaded successfully so far. */
  private var loadedDrivers: Set[String] = Set.empty

  /** Name of the JDBC driver used when the caller does not specify one. */
  private val DefaultJDBC_DriverName = "com.mysql.jdbc.Driver"

  /** Id of the JVM default time zone, sent to the database server with every connection. */
  private val TimeZoneName = java.util.TimeZone.getDefault.getID

  /** Sets the current SQLContext.
    * @param _sqlContext SQLContext used for all subsequent JDBC reads
    */
  def setSQL_Context(_sqlContext: SQLContext): Unit =
    sqlContext = _sqlContext

  /** Builds the JDBC url used to open connections.
    * It fixes the server time zone to 'java.util.TimeZone.getDefault.getID' to avoid
    * problems when accessing time values. Credentials are NOT embedded in the url
    * (see `connectionProperties`), so special characters in a password cannot corrupt it.
    * @param basicURL Starting url, with or without an existing query string
    * @return The starting url extended with the time zone options
    */
  private def buildURL(basicURL: String): String = {
    // Extend an existing query string with '&' instead of starting a second one with '?'.
    val separator = if (basicURL.contains("?")) "&" else "?"
    s"$basicURL${separator}useLegacyDatetimeCode=false&serverTimezone=$TimeZoneName"
  }

  /** Builds the connection properties handed to the JDBC reader.
    * @param user Name of the JDBC database user (skipped when null)
    * @param pass Password of the JDBC database user (skipped when null)
    * @param driverName Name of the JDBC driver
    * @return Properties holding the "driver", "user" and "password" entries
    */
  private def connectionProperties(user: String
                                 , pass: String
                                 , driverName: String): java.util.Properties = {
    val prop = new java.util.Properties
    prop.setProperty("driver", driverName)
    // Properties rejects null values, so absent credentials are simply left out.
    Option(user).foreach(prop.setProperty("user", _))
    Option(pass).foreach(prop.setProperty("password", _))
    prop
  }

  /** Loads a JDBC driver.
    * @param driverName Name of the driver class
    * @return true if the driver has been properly loaded
    *         or false if the driver could not be loaded
    */
  def loadDriver(driverName: String): Boolean = {
    logInfo(s"Loading JDBC driver '$driverName'")
    try {
      Class.forName(driverName).getDeclaredConstructor().newInstance()
      logInfo(s"JDBC driver '$driverName' loaded sucessfully")
      loadedDrivers += driverName
      true
    } catch {
      case e: Exception =>
        logError(e, s"Error loading JDBC driver '$driverName'. ")
        loadedDrivers -= driverName
        false
    }
  }

  /** Gets a table from a JDBC database and creates a dataframe with its content.
    * @param url JDBC url
    * @param tableName Name of the JDBC database table
    * @param user Name of the JDBC database user
    * @param pass Password of the JDBC database user
    * @param driverName Name of the JDBC driver
    * @return Some(dataframe) if the dataframe has been properly created
    *         or None if the driver or the table could not be loaded
    */
  def loadTable(url: String
              , tableName: String
              , user: String
              , pass: String
              , driverName: String = DefaultJDBC_DriverName): Option[DataFrame] = {
    logInfo(s"Loading table '$tableName' from '$url'")
    // The check is per driver name, so a second, different driver is loaded as well.
    val driverReady = loadedDrivers.contains(driverName) || loadDriver(driverName)
    if (!driverReady) {
      None
    } else {
      try {
        Some(sqlContext.read.jdbc(buildURL(url)
                                , tableName
                                , connectionProperties(user, pass, driverName)))
      } catch {
        case e: Exception =>
          logError(e, s"Error loading table from ' $url'")
          None
      }
    }
  }

  /** Gets the name list of the JDBC database tables stored in a JDBC scheme.
    * @param url JDBC url
    * @param schemeName Name of the scheme whose tables are listed
    * @param user Name of the JDBC database user
    * @param pass Password of the JDBC database user
    * @param driverName Name of the JDBC driver
    * @return Name list of the tables of the scheme (empty for an empty scheme)
    *         or None in case of error
    */
  def getTableListFromScheme(url: String
                           , schemeName: String
                           , user: String
                           , pass: String
                           , driverName: String = DefaultJDBC_DriverName): Option[Array[String]] =
    loadTable(url
            , "information_schema.tables"
            , user
            , pass
            , driverName).flatMap { tables =>
      try {
        // The scheme name is compared as a column value instead of being spliced into
        // SQL text, so quotes in the name cannot alter the query, and no temporary
        // table has to be registered and dropped.
        val tableNames = tables
          .filter(tables("table_schema") === schemeName)
          .select("table_name")
          .collect()
          .map(_.getString(0))
        Some(tableNames)
      } catch {
        case e: Exception =>
          logError(e, s"Error listing tables of scheme '$schemeName'")
          None
      }
    }

  /** Saves a JDBC database table into Parquet format, deleting previous content in the output.
    * @param schemeName Name of the scheme of the table (not used to build the output path)
    * @param tableName Name of the table; also the name of the output directory
    * @param df Data of the table
    * @param rootPath Path to store the formatted data
    */
  def saveFileAsParquet(schemeName: String
                      , tableName: String
                      , df: DataFrame
                      , rootPath: String): Unit = {
    // ExportUtil is a custom class for interacting with hdfs and files; you can implement your own.
    // DFUtil is another helper class; you can implement reusable methods there.
    val path = ExportUtil.ensureEndWithFileSeparator(rootPath) + tableName
    ExportUtil.deleteDirectory(path)
    ExportUtil.ensureDirectoryExist(path)
    DFUtil.saveDataframeAsParquet(sqlContext
                                , df
                                , path)
  }

  /** Saves a complete JDBC database scheme into Parquet format.
    * @param url JDBC url
    * @param schemeName Name of the scheme
    * @param user Name of the JDBC database user
    * @param pass Password of the JDBC database user
    * @param rootPath Path to store the formatted data
    * @param driverName Name of the JDBC driver
    * @return true if the table list was obtained and every table was exported,
    *         false otherwise
    */
  def saveSchemeAsParquet(url: String
                        , schemeName: String
                        , user: String
                        , pass: String
                        , rootPath: String
                        , driverName: String = DefaultJDBC_DriverName): Boolean = {
    logInfo(s"Converting into Parquet format scheme: '$schemeName' into root path: '$rootPath'")
    getTableListFromScheme(url
                         , schemeName
                         , user
                         , pass
                         , driverName) match {
      case None => false
      case Some(tableNames) =>
        val allSaved = saveTableListAsParquet(url
                                            , schemeName
                                            , tableNames
                                            , user
                                            , pass
                                            , rootPath
                                            , driverName)
        logInfo(s"Converted into Parquet format scheme: '$schemeName' into root path: '$rootPath'")
        allSaved
    }
  }

  /** Saves a JDBC database table name list into Parquet format.
    * Tables that cannot be loaded are skipped; errors raised while writing are not caught here.
    * @param url JDBC url
    * @param schemeName Name of the scheme
    * @param tableList Table name list
    * @param user Name of the JDBC database user
    * @param pass Password of the JDBC database user
    * @param rootPath Path to store the formatted data
    * @param driverName Name of the JDBC driver
    * @return true if every table of the list could be loaded and was saved
    *         (also true for an empty list), false if at least one table was skipped
    */
  def saveTableListAsParquet(url: String
                           , schemeName: String
                           , tableList: Seq[String]
                           , user: String
                           , pass: String
                           , rootPath: String
                           , driverName: String = DefaultJDBC_DriverName): Boolean = {
    val tableListAsString = tableList.mkString(",")
    val count = tableList.size
    logInfo(s"Converting into Parquet a list of $count tables: $tableListAsString into root path: '$rootPath'")

    // Exports a single table and reports whether it could be loaded.
    def exportTable(tableName: String): Boolean =
      loadTable(url
              , schemeName + "." + tableName
              , user
              , pass
              , driverName) match {
        case Some(df) =>
          saveFileAsParquet(schemeName
                          , tableName
                          , df
                          , ExportUtil.ensureEndWithFileSeparator(rootPath) + schemeName)
          df.unpersist() // remove from cache
          true
        case None => false
      }

    // exportTable sits on the left of && so every table is attempted even after a failure.
    val allSaved = tableList.foldLeft(true) { (okSoFar, tableName) =>
      exportTable(tableName) && okSoFar
    }
    logInfo(s"Converted into Parquet a list of $count tables: $tableListAsString into root path: '$rootPath'")
    allSaved
  }
}