How can I speed up my log-parsing Spark job?

Time: 2017-07-12 19:20:45

Tags: scala apache-spark spark-streaming

My current architecture is: AWS ELB writes its logs to S3 and sends a message to SQS for further processing by Spark Streaming. The pipeline works, but my problem is that each batch takes quite a while. I am new to Spark and Scala, so I just want to make sure I am not doing something silly.

// Imports added for completeness; Json/JsValue are assumed to be play-json and
// Logger is assumed to be log4j. SQSReceiver, LogLine2Json and sparrowSchema are
// not Spark classes; their imports are omitted here.
import java.text.SimpleDateFormat

import com.amazonaws.regions.Regions
import org.apache.log4j.Logger
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import play.api.libs.json.{JsValue, Json}

val conf = new SparkConf()
  .setAppName("SparrowOrc")
  .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
  .set("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
  .set("spark.speculation", "false")


val sc = new SparkContext(conf)
val streamContext = new StreamingContext(sc, Seconds(1))

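// Each SQS message is an S3 event notification describing a newly written log file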
val sqs = streamContext.receiverStream(new SQSReceiver("queue")
  .at(Regions.US_EAST_1)
  .withTimeout(5))

// Got 10 messages at a time
val s3Keys = sqs.map(messages => {      
  val sqsMsg: JsValue = Json.parse(messages)
  val s3Key = "s3://" +
    Json.stringify(sqsMsg("Records")(0)("s3")("bucket")("name")).replace("\"", "") + "/" +
    Json.stringify(sqsMsg("Records")(0)("s3")("object")("key")).replace("\"", "")
  s3Key
})

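// For every batch: collect the S3 keys on the driver, create one RDD per file,
// and union them into a single RDD of raw log lines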
val rawLogs: DStream[String] = s3Keys.transform(keys => {
  val fileKeys = keys.collect()
  val files = fileKeys.map(f => {
    sc.textFile(f)
  })
  sc.union(files)
})

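// One LogLine2Json instance is reused for the whole partition; lines that fail
// to parse are logged and mapped to an empty string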
val jsonRows = rawLogs.mapPartitions(partition => {
  // Parsing raw log to json
  val txfm = new LogLine2Json
  val log = Logger.getLogger("parseLog")
  partition.map(line => {
    try{
      txfm.parseLine(line)
    }
    catch {
      case e: Throwable => {log.info(line); "";}
    }
  }).filter(line => line != "{}")
})

val sqlSession = SparkSession
  .builder()
  .getOrCreate()

// Write each batch to S3 as date-partitioned, zlib-compressed ORC
jsonRows.foreachRDD(r => {
  val parsedFormat = new SimpleDateFormat("yyyy-MM-dd/")
  val parsedDate = parsedFormat.format(new java.util.Date())
  val outputPath = "bucket" + parsedDate
  val jsonDf = sqlSession.read.schema(sparrowSchema.schema).json(r)
  jsonDf.write.mode("append").format("orc").option("compression","zlib").save(outputPath)
})

streamContext.start()
streamContext.awaitTermination()

Here is the DAG; it looks like everything gets merged in the union transformation.

[DAG image]
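
One thing I have been considering but have not tested: Spark's textFile accepts a comma-separated list of paths, so the per-file union inside transform could be replaced by a single read over all keys in the batch (assuming none of the keys contain a comma). A rough sketch, with everything else unchanged:

// Sketch only: read every key of the batch with one textFile call instead of
// unioning one RDD per file; sc and s3Keys are the same as above.
val rawLogs: DStream[String] = s3Keys.transform(keys => {
  val fileKeys = keys.collect()
  if (fileKeys.isEmpty) sc.emptyRDD[String]
  else sc.textFile(fileKeys.mkString(","))
})

I have not verified whether this actually changes the DAG or the batch time.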
