I am using Spark to process a set of files. After conversion to a Spark DataFrame, the result should be saved to a database. The code below works when Spark runs in "local[*]" mode, but when I run it on the cluster in YARN mode, processing finishes without errors (apart from some of these errors), yet the database stays empty.
import java.sql.{Connection, DriverManager, Timestamp, SQLException}
import java.util.Properties
import org.apache.spark.sql.SparkSession
import scala.collection.JavaConverters._
import java.util.Calendar
import scala.collection.mutable.ListBuffer
import com.qbeats.cortex.library.{PartialDateTime, TimeExtractor}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._
object CommoncrawlExtractor extends App {
  var driver: String = null
  var connectionString: String = null
  var helper: Helper = null
  var sc: SparkContext = null
  // sc is not available yet here; the broadcast is created in processFile().
  var pte: org.apache.spark.broadcast.Broadcast[TimeExtractor] = null

  def uncertainty = 60 * 60 * 12

  case class SectionData(warcinfoID: String, recordID: String, sectionName: Int,
                         timestamp: Timestamp, uncertainty: Int, wordsets: Array[Array[String]])
  case class Word(word: String)
  case class Wordset(section_id: Int, wordset: Seq[Int])

  // Skip the first element of a partition's iterator (the fragment before the first delimiter).
  def dropFirst(iterator: Iterator[String]): Iterator[String] = {
    if (iterator.hasNext) {
      iterator.next
    }
    iterator
  }
  // Parse one WARC entity: pull out the header IDs and request time,
  // then extract timestamped sentence wordsets from the HTTP response body.
  def extractSentences(entity: String) = {
    val result = ListBuffer[(String, String, Int, Timestamp, Int, Array[Array[String]])]()

    val warcinfoIDPattern = """WARC-Warcinfo-ID: <urn:uuid:(.+)>""".r
    val warcinfoID = warcinfoIDPattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")

    val recordIDPattern = """WARC-Record-ID: <urn:uuid:(.+)>""".r
    val recordID = recordIDPattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")

    val requestTimePattern = """WARC-Date: (.+)""".r
    val requestTimeString = requestTimePattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")
    val requestTimeFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'")
    val requestTime = requestTimeFormat.parse(requestTimeString)
    var cal: Calendar = Calendar.getInstance()
    cal.setTime(requestTime)
    val referenceDate1 = new PartialDateTime(cal, null)

    val contentPattern = """(?s)\r\nHTTP/1\.. 200(.+?)(\r\n){2,}(.+)WARC/1.0\r\nWARC-Type: metadata""".r
    val contentString = contentPattern.findFirstMatchIn(entity).map(_ group 3).getOrElse("")

    try {
      val de = pte.value.extractTimes(contentString)
      if (de.getEntries != null) {
        for (entry <- de.getEntries.asScala) {
          val pdt = entry.resolve(12 * 3600, referenceDate1)
          if (pdt != null) {
            val sectionWordsets = entry.getSentences.asScala.map(x => x.getTokens.asScala.toArray[String]).toArray
            val sectionData = (
              warcinfoID, recordID, entry.getId,
              new Timestamp(pdt.secondsSinceEpoch * 1000), pdt.uncertaintyInterval.toInt, sectionWordsets
            )
            result += sectionData
          }
        }
      }
    } catch {
      case e: Exception => println("\n" + "-" * 100 + "\n" + entity)
    }

    result
  }
  def initDB() = {
    driver = "org.postgresql.Driver"
    connectionString = "jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl?user=postgres&password=postgres"
    Class.forName(driver)
  }
  // Drop any tables and stored procedures left over from previous runs, then recreate the schema.
  def prepareDB() = {
    var conn: Connection = null
    try {
      conn = DriverManager.getConnection(connectionString)
      val statement = conn.createStatement()

      val tableResultSet = statement.executeQuery(
        """
          |SELECT table_name
          | FROM information_schema.tables
          | WHERE table_schema='public'
          | AND table_type='BASE TABLE';
        """.stripMargin)
      val tablesToDelete = ListBuffer[String]()
      while (tableResultSet.next()) {
        tableResultSet.getString("table_name") match {
          case "warcinfo" => tablesToDelete.append("warcinfo")
          case "record" => tablesToDelete.append("record")
          case "section" => tablesToDelete.append("section")
          case "word" => tablesToDelete.append("word")
          case "wordset" => tablesToDelete.append("wordset")
          case _ =>
        }
      }
      for (tableName <- tablesToDelete) statement.executeUpdate("DROP TABLE " + tableName + ";")

      val storedProcedureResultSet = statement.executeQuery(
        """
          |SELECT proname, prosrc
          |FROM pg_catalog.pg_namespace n
          |JOIN pg_catalog.pg_proc p
          |ON pronamespace = n.oid
          |WHERE nspname = 'public';
        """.stripMargin)
      val storedProcedureDeletions = ListBuffer[String]()
      while (storedProcedureResultSet.next()) {
        storedProcedureResultSet.getString("proname") match {
          case "update_word_ids" =>
            storedProcedureDeletions.append("DROP FUNCTION update_word_ids();")
          case _ =>
        }
      }
      statement.executeUpdate("DROP TRIGGER IF EXISTS update_word_ids_trigger ON wordset_occurrence;")
      for (storedProcedureDeletion <- storedProcedureDeletions) statement.executeUpdate(storedProcedureDeletion)

      statement.executeUpdate(
        """
          |CREATE TABLE warcinfo (
          |  warcinfo_id serial PRIMARY KEY,
          |  batch_name varchar NOT NULL,
          |  warcinfo_uuid char(36) NOT NULL
          |);
        """.stripMargin)
      statement.executeUpdate(
        """
          |CREATE TABLE record (
          |  record_id serial PRIMARY KEY,
          |  record_uuid char(36) NOT NULL
          |);
        """.stripMargin)
      statement.executeUpdate(
        """
          |CREATE TABLE section (
          |  section_id serial PRIMARY KEY,
          |  record_id integer NOT NULL,
          |  section_name integer NOT NULL,
          |  timestamp timestamp NOT NULL,
          |  uncertainty integer NOT NULL
          |);
        """.stripMargin)
      statement.executeUpdate(
        """
          |CREATE TABLE word (
          |  word_id serial PRIMARY KEY,
          |  word varchar NOT NULL
          |);
        """.stripMargin)
      statement.executeUpdate(
        """
          |CREATE TABLE wordset (
          |  section_id integer NOT NULL,
          |  wordset integer ARRAY
          |);
        """.stripMargin)
    } catch {
      case e: SQLException => println("exception caught: " + e)
    } finally {
      if (conn != null) conn.close()
    }
  }
  def processFile(fileNames: Array[String], accessKeyId: String = "", secretAccessKey: String = ""): Unit = {
    val delimiter = "WARC/1.0\r\nWARC-Type: request\r\n"
    pte = sc.broadcast(new TimeExtractor)

    val spark = SparkSession
      .builder()
      .appName("CommoncrawlExtractor")
      .getOrCreate()
    import spark.implicits._

    val connString = "jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl"
    val prop = new Properties()
    prop.put("user", "postgres")
    prop.put("password", "postgres")

    // Split the input into WARC entities, parse each one, and cache the result for reuse below.
    val entities = sc.
      textFile(fileNames.mkString(",")).
      mapPartitions(dropFirst).
      map(delimiter + _).
      flatMap(extractSentences).
      map(x => SectionData(x._1, x._2, x._3, x._4, x._5, x._6)).toDF().
      cache()

    val warcinfo = entities.select("warcinfoID").distinct().
      withColumnRenamed("warcinfoID", "warcinfo_uuid").
      withColumn("batch_name", lit("June 2016, batch 1"))
    val warcinfoWriter = warcinfo.write.mode("append")
    println("Saving warcinfo.")
    println(Calendar.getInstance().getTime)
    warcinfoWriter.jdbc(connString, "warcinfo", prop)
    println(Calendar.getInstance().getTime)

    val record = entities.select("recordID").distinct().
      withColumnRenamed("recordID", "record_uuid")
    val recordWriter = record.write.mode("append")
    println("Saving records.")
    println(Calendar.getInstance().getTime)
    recordWriter.jdbc(connString, "record", prop)
    println(Calendar.getInstance().getTime)

    // Read the records back to obtain the generated record_id values.
    val recordFull = spark.read.
      format("jdbc").
      options(Map("url" -> connString, "dbtable" -> "public.record", "user" -> "postgres", "password" -> "postgres")).
      load().cache()

    val section = entities.
      join(recordFull, entities.col("recordID").equalTo(recordFull("record_uuid"))).
      select("record_id", "sectionName", "timestamp", "uncertainty").distinct().
      withColumnRenamed("sectionName", "section_name")
    val sectionWriter = section.write.mode("append")
    println("Saving sections.")
    println(Calendar.getInstance().getTime)
    sectionWriter.jdbc(connString, "section", prop)
    println(Calendar.getInstance().getTime)

    val sectionFull = spark.read.
      format("jdbc").
      options(Map("url" -> connString, "dbtable" -> "public.section", "user" -> "postgres", "password" -> "postgres")).
      load()

    val word = entities.
      select("wordsets").
      flatMap(r => r.getAs[Seq[Seq[String]]]("wordsets").flatten).
      distinct().
      map(Word(_))
    val wordWriter = word.write.mode("append")
    wordWriter.jdbc(connString, "word", prop)

    // word -> word_id lookup map, collected to the driver.
    val wordFull = spark.read.
      format("jdbc").
      options(Map("url" -> connString, "dbtable" -> "public.word", "user" -> "postgres", "password" -> "postgres")).
      load().
      map(row => (row.getAs[String]("word"), row.getAs[Int]("word_id"))).
      collect().
      toMap

    val wordsetTemp = entities.
      join(recordFull, entities.col("recordID").equalTo(recordFull("record_uuid"))).
      withColumnRenamed("sectionName", "section_name")
    val wordset = wordsetTemp.
      join(sectionFull, Seq("record_id", "section_name")).
      select("section_id", "wordsets").
      flatMap(r => r.getAs[Seq[Seq[String]]]("wordsets").map(x => Wordset(r.getAs[Int]("section_id"), x.map(wordFull))))
    val wordsetWriter = wordset.write.mode("append")
    println("Saving wordsets.")
    println(Calendar.getInstance().getTime)
    wordsetWriter.jdbc(connString, "wordset", prop)
    println(Calendar.getInstance().getTime)

    // entities.saveAsTextFile(helper.outputDirectory + "xyz")
    sc.stop
  }
  override def main(args: Array[String]): Unit = {
    if (args.length >= 2) {
      initDB()
      prepareDB()
      helper = new Helper

      val files =
        if (args(0).startsWith("hdfs://")) helper.getHDFSFiles(args(0)).slice(0, args(3).toInt)
        else helper.getLocalFiles(args(0))

      val appName = "CommoncrawlExtractor"
      val conf = new SparkConf().setAppName(appName)
      if (args(0).startsWith("hdfs://")) {
        conf.set("spark.executor.instances", args(1))
        conf.set("spark.executor.cores", args(2))
      } else conf.setMaster(args(1))
      sc = new SparkContext(conf)

      // Use the WARC request header as the record delimiter when reading the input files.
      val delimiter = "WARC/1.0\r\nWARC-Type: request"
      sc.hadoopConfiguration.set("textinputformat.record.delimiter", delimiter)

      processFile(files)
    }
  }
}
I copied postgresql-9.4.1209.jre7.jar to /home/user/Programs/libs on every machine in the cluster and submit the job with the following command (run from Spark's directory):
./bin/spark-submit --master yarn --deploy-mode client --driver-class-path /home/user/Programs/libs/postgresql-9.4.1209.jre7.jar --jars /home/user/Programs/libs/postgresql-9.4.1209.jre7.jar --conf "spark.driver.extraClassPath=/home/user/Programs/libs/postgresql-9.4.1209.jre7.jar" --conf "spark.executor.extraClassPath=/home/user/Programs/libs/postgresql-9.4.1209.jre7.jar" spark-cortex-fat.jar hdfs://LV-WS10.lviv:9000/commoncrawl 2 4 8
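To narrow things down, a minimal smoke test submitted with the same command should show whether the executors can load the PostgreSQL driver and reach the database at all. This is only a sketch; the object name JdbcSmokeTest and the table name jdbc_smoke_test are placeholders of mine. It writes a tiny DataFrame through the same DataFrameWriter.jdbc path, so the WARC parsing is not involved:

import java.util.Properties
import org.apache.spark.sql.SparkSession

object JdbcSmokeTest extends App {
  val spark = SparkSession.builder().appName("JdbcSmokeTest").getOrCreate()
  import spark.implicits._

  // Same connection settings as the real job.
  val connString = "jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl"
  val prop = new Properties()
  prop.put("user", "postgres")
  prop.put("password", "postgres")

  // Generate the rows on the executors so the write exercises the executor classpath,
  // not just the driver's.
  spark.sparkContext.parallelize(1 to 100, 4).toDF("value")
    .write.mode("append")
    .jdbc(connString, "jdbc_smoke_test", prop)

  spark.stop()
}

If this also leaves the database empty under YARN, the problem is with the driver jar or connectivity rather than with my transformations.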
Please suggest how I can get this to run on the cluster.
Added later:
I found that the following lines
val warcinfo = entities.select("warcinfoID").
  withColumnRenamed("warcinfoID", "warcinfo_uuid").
  withColumn("batch_name", lit("June 2016, batch 1"))
val warcinfoWriter = warcinfo.write.mode("append")
println("Saving warcinfo.")
println(Calendar.getInstance().getTime)
warcinfoWriter.jdbc(connString, "warcinfo", prop)
println(Calendar.getInstance().getTime)
cause this exception:
16/09/01 17:31:51 WARN scheduler.TaskSetManager: Lost task 0.1 in stage 1.0 (TID 5, LV-WS09): org.apache.spark.storage.BlockFetchException: Failed to fetch block after 1 fetch failures. Most recent failure cause:
at org.apache.spark.storage.BlockManager.getRemoteBytes(BlockManager.scala:565)
at org.apache.spark.storage.BlockManager.getRemoteValues(BlockManager.scala:522)
at org.apache.spark.storage.BlockManager.get(BlockManager.scala:609)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:661)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:96)
at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:95)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1203)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1203)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1203)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1325)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1211)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1190)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Exception thrown in awaitResult:
at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:194)
at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:104)
at org.apache.spark.storage.BlockManager.getRemoteBytes(BlockManager.scala:554)
... 31 more
Caused by: java.io.IOException: Failed to connect to ubuntu-cluster-4/192.168.100.139:36378
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:228)
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:179)
at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:96)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
... 3 more
Caused by: java.net.ConnectException: Connection refused: ubuntu-cluster-4/192.168.100.139:36378
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717)
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
... 1 more
However, some records do get stored in the database. What would you suggest?
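In case it is relevant, entities is the only dataset I cache in processFile, and the trace above is about fetching one of its cached blocks from another executor. A variation I could try, sketched under the assumption that a disk-backed, replicated storage level helps here (I have not verified that it avoids the BlockFetchException), is:

import org.apache.spark.storage.StorageLevel

// Sketch: replace cache() with a storage level that also keeps a disk copy and a
// second replica, so a single lost or unreachable executor does not hold the only copy.
val entities = sc.
  textFile(fileNames.mkString(",")).
  mapPartitions(dropFirst).
  map(delimiter + _).
  flatMap(extractSentences).
  map(x => SectionData(x._1, x._2, x._3, x._4, x._5, x._6)).toDF().
  persist(StorageLevel.MEMORY_AND_DISK_2)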
Added later:
I looked at the YARN logs on the node that stopped responding, but they did not help: logs.