我是斯卡拉的新手;以下代码,不打印df中的值,并且火花没有停止,即使在运行此代码半小时后它仍然继续。
import java.sql.DriverManager
import java.sql.Connection
import org.apache.spark._
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext._
import org.apache.spark.sql.SQLContext._
import org.apache.spark.sql._
import java.util.concurrent.TimeUnit
object MysqlTest {
def main(args: Array[String]) {
val prop = new java.util.Properties()
val conf = new SparkConf().setAppName("MysqlDataLoad").setMaster("local")
val sc = new SparkContext(conf)
val sqlcontext = new org.apache.spark.sql.SQLContext(sc)
prop.put("user", "***")
prop.put("password", "*****")
val url = "jdbc:mysql://acb-cluster.cluster-cfdz.us-wt-2.rds.amazonaws.com:3306/gsl"
val df: DataFrame = sqlcontext.read.jdbc(url, "test_20160930_result_prop_alpha", prop)
df.createOrReplaceTempView("gsl")
// Create dataframe of required columns from GSL table
println("********* Data For GSL **********")
val dataFrame2 = sqlcontext.sql("select * from gsl limit 10")
dataFrame2.show()
sc.stop()
}
}
日志:
7/05/31 12:30:51 INFO Executor: Starting executor ID driver on host localhost 17/05/31 12:30:51 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 41593. 17/05/31 12:30:51 INFO NettyBlockTransferService: Server created on 192.168.0.132:41593 17/05/31 12:30:51 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy 17/05/31 12:30:51 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 192.168.0.132, 41593, None) 17/05/31 12:30:51 INFO BlockManagerMasterEndpoint: Registering block manager 192.168.0.132:41593 with 1407.3 MB RAM, BlockManagerId(driver, 192.168.0.132, 41593, None) 17/05/31 12:30:51 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 192.168.0.132, 41593, None) 17/05/31 12:30:51 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, 192.168.0.132, 41593, None) 17/05/31 12:30:52 INFO SharedState: Warehouse path is 'file:/home/vna/spark_workspace/sz-dw-etl/spark-warehouse/'. 17/05/31 12:30:57 INFO SparkSqlParser: Parsing command: gsl ********* Data For GSL **********17/05/31 12:30:57 INFO SparkSqlParser: Parsing command: select * from gsl limit 10 17/05/31 12:30:57 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf. 17/05/31 12:30:58 INFO CodeGenerator: Code generated in 320.985934 ms 17/05/31 12:30:58 INFO SparkContext: Starting job: collect at MysqlTest.scala:34 17/05/31 12:30:58 INFO DAGScheduler: Got job 0 (collect at MysqlTest.scala:34) with 1 output partitions 17/05/31 12:30:58 INFO DAGScheduler: Final stage: ResultStage 0 (collect at MysqlTest.scala:34) 17/05/31 12:30:58 INFO DAGScheduler: Parents of final stage: List() 17/05/31 12:30:58 INFO DAGScheduler: Missing parents: List() 17/05/31 12:30:58 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[3] at collect at MysqlTest.scala:34), which has no missing parents 17/05/31 12:30:58 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 14.8 KB, free 1407.3 MB) 17/05/31 12:30:58 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 6.2 KB, free 1407.3 MB) 17/05/31 12:30:58 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on 192.168.0.132:41593 (size: 6.2 KB, free: 1407.3 MB) 17/05/31 12:30:58 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:996 17/05/31 12:30:58 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[3] at collect at MysqlTest.scala:34) 17/05/31 12:30:58 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks 17/05/31 12:30:58 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, executor driver, partition 0, PROCESS_LOCAL, 5723 bytes) 17/05/31 12:30:58 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
编辑:通过从数据库中获取另一个较小的表,这将返回结果。
不确定为什么即使我将查询限制为10条记录,它仍然需要失败。
因为我在本地(12 gb机器)上运行了一个火花簇,是否需要更多内存才能运行?我试图运行的只是一个10记录查询。 (由SCALA IDE运行)
我想要获取的表的更多细节是:它的44 gb,有100000000条记录。但是我的查询显然限制了它获取10条记录而没有任何排序。
答案 0 :(得分:3)
尝试做这样的事情:
val properties = new Properties()
properties.put("user", "root")
properties.put("password", "123456")
val url = "jdbc:mysql://localhost:3306/sakila"
val df = spark.read.jdbc(url,"actor",properties = properties)
确保连接器已加载。
libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.49"