Saving Spark DataFrames to a database through Spark SQL works in "local[*]" mode but not in YARN mode

Time: 2016-08-26 18:28:14

Tags: scala apache-spark cluster-computing apache-spark-sql yarn

I am using Spark to process a set of files. The result, after being converted to a Spark DataFrame, should be saved to a database. The following code works when Spark runs in "local[*]" mode, but when I run it on the cluster in YARN mode, processing finishes without errors (apart from a few errors like these), yet the database stays empty.

import java.sql.{Connection, DriverManager, Timestamp, SQLException}
import java.util.Properties
import org.apache.spark.sql.SparkSession

import scala.collection.JavaConverters._
import java.util.Calendar

import scala.collection.mutable.ListBuffer
import com.qbeats.cortex.library.{PartialDateTime, TimeExtractor}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._

object CommoncrawlExtractor extends App {
  var driver: String = null
  var connectionString: String = null
  var helper: Helper = null
  var sc: SparkContext = null
  var pte = sc.broadcast(new TimeExtractor)
  def uncertainty = 60 * 60 * 12

  case class SectionData(warcinfoID: String, recordID: String, sectionName: Int,
                     timestamp: Timestamp, uncertainty: Int, wordsets: Array[Array[String]])

  case class Word(word: String)

  case class Wordset(section_id: Int, wordset: Seq[Int])

  def dropFirst(iterator: Iterator[String]): Iterator[String] = {
    if (iterator.hasNext) {
      iterator.next
    }
    iterator
  }

  def extractSentences(entity: String) = {
    val result = ListBuffer[(String, String, Int, Timestamp, Int, Array[Array[String]])]()

    val warcinfoIDPattern = """WARC-Warcinfo-ID: <urn:uuid:(.+)>""".r
    val warcinfoID = warcinfoIDPattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")
    val recordIDPattern = """WARC-Record-ID: <urn:uuid:(.+)>""".r
    val recordID = recordIDPattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")

    val requestTimePattern = """WARC-Date: (.+)""".r
    val requestTimeString = requestTimePattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")
    val requestTimeFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'")
    val requestTime = requestTimeFormat.parse(requestTimeString)
    var cal: Calendar = Calendar.getInstance()
    cal.setTime(requestTime)
    val referenceDate1 = new PartialDateTime(cal, null)

    val contentPattern = """(?s)\r\nHTTP/1\.. 200(.+?)(\r\n){2,}(.+)WARC/1.0\r\nWARC-Type: metadata""".r
    val contentString = contentPattern.findFirstMatchIn(entity).map(_ group 3).getOrElse("")

    try {
      val de = pte.value.extractTimes(contentString)
      if (de.getEntries != null) {
        for (entry <- de.getEntries.asScala) {
          val pdt = entry.resolve(12 * 3600, referenceDate1)
          if (pdt != null) {
            val sectionWordsets = entry.getSentences.asScala.map(x => x.getTokens.asScala.toArray[String]).toArray
            val sectionData = (
              warcinfoID, recordID, entry.getId,
              new Timestamp(pdt.secondsSinceEpoch * 1000), pdt.uncertaintyInterval.toInt, sectionWordsets
            )
            result += sectionData
          }
        }
      }
    } catch {
      case e: Exception => println("\n" + "-" * 100 + "\n" + entity)
    }

    result
  }

  def initDB() = {
    driver = "org.postgresql.Driver"
    connectionString = "jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl?user=postgres&password=postgres"
    Class.forName(driver)
  }

  def prepareDB() = {
    var conn: Connection = null

    try {
      conn = DriverManager.getConnection(connectionString)
      val statement = conn.createStatement()

      val tableResultSet = statement.executeQuery(
        """
          |SELECT table_name
          |    FROM information_schema.tables
          |    WHERE table_schema='public'
          |    AND table_type='BASE TABLE';
        """.stripMargin)
      val tablesToDelete = ListBuffer[String]()
      while (tableResultSet.next()) {
        tableResultSet.getString("table_name") match {
          case "warcinfo" => tablesToDelete.append("warcinfo")
          case "record" => tablesToDelete.append("record")
          case "section" => tablesToDelete.append("section")
          case "word" => tablesToDelete.append("word")
          case "wordset" => tablesToDelete.append("wordset")
          case _ =>
        }
      }
      for (tableName <- tablesToDelete) statement.executeUpdate("DROP TABLE " + tableName + ";")

      val storedProcedureResultSet = statement.executeQuery(
        """
          |SELECT  proname, prosrc
          |FROM    pg_catalog.pg_namespace n
          |JOIN    pg_catalog.pg_proc p
          |ON      pronamespace = n.oid
          |WHERE   nspname = 'public';
        """.stripMargin)
      val storedProcedureDeletions = ListBuffer[String]()
      while (storedProcedureResultSet.next()) {
        storedProcedureResultSet.getString("proname") match {
          case "update_word_ids" =>
            storedProcedureDeletions.append("DROP FUNCTION update_word_ids();")
          case _ =>
        }
      }
      statement.executeUpdate("DROP TRIGGER IF EXISTS update_word_ids_trigger ON wordset_occurrence;")
      for (storedProcedureDeletion <- storedProcedureDeletions) statement.executeUpdate(storedProcedureDeletion)

      statement.executeUpdate(
        """
          |CREATE TABLE warcinfo (
          |    warcinfo_id serial PRIMARY KEY,
          |    batch_name varchar NOT NULL,
          |    warcinfo_uuid char(36) NOT NULL
          |);
        """.stripMargin)
      statement.executeUpdate(
        """
          |CREATE TABLE record (
          |    record_id serial PRIMARY KEY,
          |    record_uuid char(36) NOT NULL
          |);
        """.stripMargin)
      statement.executeUpdate(
        """
          |CREATE TABLE section (
          |    section_id serial PRIMARY KEY,
          |    record_id integer NOT NULL,
          |    section_name integer NOT NULL,
          |    timestamp timestamp NOT NULL,
          |    uncertainty integer NOT NULL
          |);
        """.stripMargin)
      statement.executeUpdate(
        """
          |CREATE TABLE word (
          |    word_id  serial PRIMARY KEY,
          |    word varchar NOT NULL
          |);
        """.stripMargin)
      statement.executeUpdate(
        """
          |CREATE TABLE wordset (
          |    section_id integer NOT NULL,
          |    wordset integer ARRAY
          |);
        """.stripMargin)
    } catch {
      case e: SQLException => println("exception caught: " + e)
    } finally {
      if (conn != null) conn.close()
    }
  }

  def processFile(fileNames: Array[String], accessKeyId: String = "", secretAccessKey: String = ""): Unit = {
    val delimiter = "WARC/1.0\r\nWARC-Type: request\r\n"
    pte = sc.broadcast(new TimeExtractor)

    val spark = SparkSession
      .builder()
      .appName("CommoncrawlExtractor")
      .getOrCreate()
    import spark.implicits._

    val connString = "jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl"
    val prop = new Properties()
    prop.put("user", "postgres")
    prop.put("password", "postgres")

    val entities = sc.
      textFile(fileNames.mkString(",")).
      mapPartitions(dropFirst).
      map(delimiter + _).
      flatMap(extractSentences).
      map(x => SectionData(x._1, x._2, x._3, x._4, x._5, x._6)).toDF().
      cache()

    val warcinfo = entities.select("warcinfoID").distinct().
      withColumnRenamed("warcinfoID", "warcinfo_uuid").
      withColumn("batch_name", lit("June 2016, batch 1"))
    val warcinfoWriter = warcinfo.write.mode("append")
    println("Saving warcinfo.")
    println(Calendar.getInstance().getTime)
    warcinfoWriter.jdbc(connString, "warcinfo", prop)
    println(Calendar.getInstance().getTime)

    val record = entities.select("recordID").distinct().
      withColumnRenamed("recordID", "record_uuid")
    val recordWriter = record.write.mode("append")
    println("Saving records.")
    println(Calendar.getInstance().getTime)
    recordWriter.jdbc(connString, "record", prop)
    println(Calendar.getInstance().getTime)

    val recordFull = spark.read.
      format("jdbc").
      options(Map("url" -> connString, "dbtable" -> "public.record", "user" -> "postgres", "password" -> "postgres")).
      load().cache()

    val section = entities.
      join(recordFull, entities.col("recordID").equalTo(recordFull("record_uuid"))).
      select("record_id", "sectionName", "timestamp", "uncertainty").distinct().
      withColumnRenamed("sectionName", "section_name")
    val sectionWriter = section.write.mode("append")
    println("Saving sections.")
    println(Calendar.getInstance().getTime)
    sectionWriter.jdbc(connString, "section", prop)
    println(Calendar.getInstance().getTime)

    val sectionFull = spark.read.
      format("jdbc").
      options(Map("url" -> connString, "dbtable" -> "public.section", "user" -> "postgres", "password" -> "postgres")).
      load()

    val word = entities.
      select("wordsets").
      flatMap(r => r.getAs[Seq[Seq[String]]]("wordsets").flatten).
      distinct().
      map(Word(_))
    val wordWriter = word.write.mode("append")
    wordWriter.jdbc(connString, "word", prop)

    val wordFull = spark.read.
      format("jdbc").
      options(Map("url" -> connString, "dbtable" -> "public.word", "user" -> "postgres", "password" -> "postgres")).
      load().
      map(row => (row.getAs[String]("word"), row.getAs[Int]("word_id"))).
      collect().
      toMap

    val wordsetTemp = entities.
      join(recordFull, entities.col("recordID").equalTo(recordFull("record_uuid"))).
      withColumnRenamed("sectionName", "section_name")
    val wordset = wordsetTemp.
      join(sectionFull, Seq("record_id", "section_name")).
      select("section_id", "wordsets").
      flatMap(r => r.getAs[Seq[Seq[String]]]("wordsets").map(x => Wordset(r.getAs[Int]("section_id"), x.map(wordFull))))
    val wordsetWriter = wordset.write.mode("append")
    println("Saving wordsets.")
    println(Calendar.getInstance().getTime)
    wordsetWriter.jdbc(connString, "wordset", prop)
    println(Calendar.getInstance().getTime)

//    entities.saveAsTextFile(helper.outputDirectory + "xyz")

    sc.stop
  }

  override def main(args: Array[String]): Unit = {
    if (args.length >= 2) {
      initDB()
      prepareDB()

      helper = new Helper
      val files =
        if (args(0).startsWith("hdfs://")) helper.getHDFSFiles(args(0)).slice(0, args(3).toInt)
        else helper.getLocalFiles(args(0))

      val appName = "CommoncrawlExtractor"
      val conf = new SparkConf().setAppName(appName)
      if (args(0).startsWith("hdfs://")) {
        conf.set("spark.executor.instances", args(1))
        conf.set("spark.executor.cores", args(2))
      } else conf.setMaster(args(1))
      sc = new SparkContext(conf)
      val delimiter = "WARC/1.0\r\nWARC-Type: request"
      sc.hadoopConfiguration.set("textinputformat.record.delimiter", delimiter)
      processFile(files)
    }
  }
}

I copied postgresql-9.4.1209.jre7.jar to /home/user/Programs/libs on every machine in the cluster and run the application with the following command (from Spark's directory):

./bin/spark-submit --master yarn --deploy-mode client --driver-class-path /home/user/Programs/libs/postgresql-9.4.1209.jre7.jar --jars /home/user/Programs/libs/postgresql-9.4.1209.jre7.jar --conf "spark.driver.extraClassPath=/home/user/Programs/libs/postgresql-9.4.1209.jre7.jar" --conf "spark.executor.extraClassPath=/home/user/Programs/libs/postgresql-9.4.1209.jre7.jar" spark-cortex-fat.jar hdfs://LV-WS10.lviv:9000/commoncrawl 2 4 8
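One thing I am unsure about: when DataFrameWriter.jdbc runs on the executors, the PostgreSQL driver has to be loadable there as well, and my Class.forName call in initDB() only runs on the driver node. A minimal sketch of a variant I am considering (not verified) is to name the driver class explicitly in the connection Properties, so Spark's JDBC writer loads it wherever the write actually runs:

    import java.util.Properties

    val prop = new Properties()
    prop.put("user", "postgres")
    prop.put("password", "postgres")
    // "driver" is a standard Spark JDBC option: the named class is loaded on each executor,
    // provided the jar is shipped via --jars / spark.executor.extraClassPath as above
    prop.put("driver", "org.postgresql.Driver")

    warcinfo.write.mode("append").
      jdbc("jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl", "warcinfo", prop)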

Please suggest how I can make this work on the cluster.

Added later:

I found that these lines

val warcinfo = entities.select("warcinfoID").
  withColumnRenamed("warcinfoID", "warcinfo_uuid").
  withColumn("batch_name", lit("June 2016, batch 1"))
val warcinfoWriter = warcinfo.write.mode("append")
println("Saving warcinfo.")
println(Calendar.getInstance().getTime)
warcinfoWriter.jdbc(connString, "warcinfo", prop)
println(Calendar.getInstance().getTime)

cause the exception

16/09/01 17:31:51 WARN scheduler.TaskSetManager: Lost task 0.1 in stage 1.0 (TID 5, LV-WS09): org.apache.spark.storage.BlockFetchException: Failed to fetch block after 1 fetch failures. Most recent failure cause:
    at org.apache.spark.storage.BlockManager.getRemoteBytes(BlockManager.scala:565)
    at org.apache.spark.storage.BlockManager.getRemoteValues(BlockManager.scala:522)
    at org.apache.spark.storage.BlockManager.get(BlockManager.scala:609)
    at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:661)
    at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:96)
    at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:95)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1203)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1203)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1203)
    at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1325)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1211)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1190)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:85)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Exception thrown in awaitResult: 
    at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:194)
    at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:104)
    at org.apache.spark.storage.BlockManager.getRemoteBytes(BlockManager.scala:554)
    ... 31 more
Caused by: java.io.IOException: Failed to connect to ubuntu-cluster-4/192.168.100.139:36378
    at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:228)
    at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:179)
    at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:96)
    at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
    at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
    at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    ... 3 more
Caused by: java.net.ConnectException: Connection refused: ubuntu-cluster-4/192.168.100.139:36378
    at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
    at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717)
    at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
    at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
    at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
    at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
    ... 1 more

However, some records do get stored in the database. What would you suggest?
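One thing I have not tried yet, only a guess from the stack trace: the failed block fetch from ubuntu-cluster-4 looks like an executor that died (perhaps killed by YARN for exceeding its memory limits). A minimal sketch of what I might change, assuming that guess is right, is to let the cached DataFrame spill to disk instead of holding everything in executor memory, and to give the containers more headroom:

    import org.apache.spark.storage.StorageLevel

    // in processFile(): replace cache() with a storage level that can spill to disk,
    // so cached partitions do not have to fit entirely in executor memory
    val entities = sc.
      textFile(fileNames.mkString(",")).
      mapPartitions(dropFirst).
      map(delimiter + _).
      flatMap(extractSentences).
      map(x => SectionData(x._1, x._2, x._3, x._4, x._5, x._6)).toDF().
      persist(StorageLevel.MEMORY_AND_DISK)

    // and, when submitting, extra headroom for the YARN containers (values are placeholders):
    //   --executor-memory 4g --conf spark.yarn.executor.memoryOverhead=1024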

Added later:

I looked at the YARN logs on the nodes that stopped responding, but they did not help: logs

0 Answers:

There are no answers yet.