我的架构现在是AWS ELB将日志写入S3并向SQS发送消息以供Spark Streaming进一步处理。它现在正在工作,但我现在的问题是花了一点时间。我是Spark和Scala的新手,所以只是想确保我没有做一些愚蠢的事。
val conf = new SparkConf()
.setAppName("SparrowOrc")
.set("spark.hadoop.fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
.set("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version","2")
.set("spark.speculation","false")
val sc = new SparkContext(conf)
val streamContext = new StreamingContext(sc, Seconds(1))
val sqs = streamContext.receiverStream(new SQSReceiver("queue")
.at(Regions.US_EAST_1)
.withTimeout(5))
// Got 10 messages at a time
val s3Keys = sqs.map(messages => {
val sqsMsg: JsValue = Json.parse(messages)
val s3Key = "s3://" +
Json.stringify(sqsMsg("Records")(0)("s3")("bucket")("name")).replace("\"", "") + "/" +
Json.stringify(sqsMsg("Records")(0)("s3")("object")("key")).replace("\"", "")
s3Key
})
val rawLogs: DStream[String] = s3Keys.transform(keys => {
val fileKeys = keys.collect()
val files = fileKeys.map(f => {
sc.textFile(f)
})
sc.union(files)
})
val jsonRows = rawLogs.mapPartitions(partition => {
// Parsing raw log to json
val txfm = new LogLine2Json
val log = Logger.getLogger("parseLog")
partition.map(line => {
try{
txfm.parseLine(line)
}
catch {
case e: Throwable => {log.info(line); "";}
}
}).filter(line => line != "{}")
})
val sqlSession = SparkSession
.builder()
.getOrCreate()
// Write to S3
jsonRows.foreachRDD(r => {
val parsedFormat = new SimpleDateFormat("yyyy-MM-dd/")
val parsedDate = parsedFormat.format(new java.util.Date())
val outputPath = "bucket" + parsedDate
val jsonDf = sqlSession.read.schema(sparrowSchema.schema).json(r)
jsonDf.write.mode("append").format("orc").option("compression","zlib").save(outputPath)
})
streamContext.start()
streamContext.awaitTermination()
}
这里是DAG,似乎所有内容都在union
转换中合并。