I have the following code. How do I pass each ID as a parameter to doWork.run() inside sc.parallelize()? And is there a better way to organize this code? (The sqlContext.read.format("jdbc").options block is duplicated, which seems like a code smell.)
import java.time.LocalDate

import org.apache.spark.{SparkConf, SparkContext}

object ConnTest {
  def main(args: Array[String]): Unit = {
    val date = LocalDate.parse(args(0))
    val conf = new SparkConf().setAppName("Test").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    val jdbcSqlConn = "jdbc:sqlserver://......;"

    val listJob = new ItemListJob(sc, sqlContext, jdbcSqlConn)
    val list = listJob.run(date) // returns the list of IDs to process

    val doWork = new DoWork(sc, sqlContext, jdbcSqlConn)
    // ??? How to run doWork for each ID in `list`?
    val processed = sc.parallelize(doWork.run(...id, date...))
  }
}
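One possible way to fill the ??? gap (a minimal sketch, not necessarily the best design): since DoWork.run itself uses sqlContext, and SparkContext/SQLContext exist only on the driver and are not serializable, the per-ID calls cannot run inside an RDD transformation. A straightforward option is to collect the IDs and loop on the driver. This assumes list is a DataFrame whose first column holds the integer IDs; the column index 0 is an assumption, not something stated in the original code.

// Sketch: drive the per-ID jobs from the driver.
// sc.parallelize(ids).map(id => doWork.run(id, date)) would fail here, because
// the closure would capture sqlContext, which cannot be shipped to executors.
val ids = list.collect().map(_.getInt(0))
val processed = ids.map(id => doWork.run(id, date))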
class ItemListJob(sc: SparkContext, sqlContext: org.apache.spark.sql.SQLContext, jdbcSqlConn: String) {
  def run(date: LocalDate) = {
    sqlContext.read.format("jdbc").options(Map(
      "driver" -> "com.microsoft.sqlserver.jdbc.SQLServerDriver",
      "url" -> jdbcSqlConn,
      "dbtable" -> s"dbo.GetList('$date')"
    )).load()
  }
}
class DoWork(sc: SparkContext, sqlContext: org.apache.spark.sql.SQLContext, jdbcSqlConn: String) {
  def run(id: Int, date: LocalDate) = {
    // Do work: read the data for this id from SQL Server and generate a text file
    val data = sqlContext.read.format("jdbc").options(Map(
      "driver" -> "com.microsoft.sqlserver.jdbc.SQLServerDriver",
      "url" -> jdbcSqlConn,
      "dbtable" -> s"someFunction('$id')"
    )).load()
    // output data to a text file
    (id, date) // return (id, date) once the work is done
  }
}
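As for the repeated sqlContext.read.format("jdbc").options block: one way to remove the duplication is to factor the shared JDBC settings into a small helper that both jobs receive instead of the raw connection string. The JdbcReader name below is illustrative, not from the original code; only the dbtable value actually differs between the two queries.

class JdbcReader(sqlContext: org.apache.spark.sql.SQLContext, jdbcSqlConn: String) {
  // Everything except the table/function expression is shared configuration.
  def read(dbtable: String): org.apache.spark.sql.DataFrame =
    sqlContext.read.format("jdbc").options(Map(
      "driver" -> "com.microsoft.sqlserver.jdbc.SQLServerDriver",
      "url" -> jdbcSqlConn,
      "dbtable" -> dbtable
    )).load()
}

With this in place, ItemListJob.run reduces to reader.read(s"dbo.GetList('$date')") and the read in DoWork.run to reader.read(s"someFunction('$id')").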