I am running the program below on Spark 1.3.1. Spark Streaming is watching a directory in HDFS for new files and should process them as they arrive. I have read that the best approach is to move the files into the watched directory from another location on the same HDFS, so that the operation is atomic.

I start my streaming job, add a bunch of small files to an unrelated HDFS directory, and then move those files from that directory into the watched HDFS directory (all with simple shell commands). But my streaming job does not pick these up as new files, so it never processes them.
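For reference, this is essentially what my move step does: a rename within a single HDFS is one namenode metadata operation, so the file should appear in the watched directory atomically (the shell equivalent I use is hdfs dfs -mv). A minimal sketch with Hadoop's FileSystem API, with placeholder paths standing in for my staging and watched directories:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object MoveIntoWatchedDir {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    // Placeholder paths: the file is written somewhere else on HDFS first,
    // then renamed into the directory the stream is watching.
    val src = new Path("hdfs:///name/spark-streaming/staging/part-00000")
    val dst = new Path("hdfs:///name/spark-streaming/data/part-00000")
    // rename() within one HDFS is atomic, so the stream never sees a
    // half-written file.
    if (!fs.rename(src, dst)) sys.error(s"rename failed: $src -> $dst")
  }
}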
Currently I am using textFileStream, but I am open to using fileStream instead. However, I get an error with this call:

val lines = ssc.fileStream[LongWritable, Text, TextInputFormat]("hdfs:///name/spark-streaming/data/", (p: Path) => true, false)
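One thing I noticed while trying this: fileStream's third type parameter is bounded by the new-API org.apache.hadoop.mapreduce.InputFormat, while the import in my code below is the old org.apache.hadoop.mapred.TextInputFormat. Assuming my error is that type mismatch, this variant should satisfy the bound (ssc is the StreamingContext from the code below):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
// New-API TextInputFormat; the mapred one does not satisfy fileStream's bound.
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

val lines = ssc.fileStream[LongWritable, Text, TextInputFormat](
    "hdfs:///name/spark-streaming/data/", (p: Path) => true, false)
  .map { case (_, text) => text.toString } // fileStream yields (key, value) pairs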
package com.com.spark.prototype

// The Hadoop imports below are only needed for the fileStream variant above.
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark._
import org.apache.spark.streaming._
import com.twitter.algebird.HyperLogLogMonoid
object HLLStreamingHDFSTest {

  def functionToCreateContext(): StreamingContext = {
    val conf = new SparkConf().set("spark.executor.extraClassPath", "/home/hadoop/spark/conf:/home/hadoop/conf:/home/hadoop/spark/classpath/emr/*:/home/hadoop/spark/classpath/emrfs/*:/home/hadoop/share/hadoop/common/lib/*:/home/hadoop/share/hadoop/common/lib/hadoop-lzo.jar")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint("/name/spark-streaming/checkpointing")

    // Watch the HDFS directory for new files, one batch every 5 seconds.
    val lines = ssc.textFileStream("hdfs:///name/spark-streaming/data/")

    // HyperLogLog with 15 bits of precision; globalHll accumulates across batches.
    val hll = new HyperLogLogMonoid(15)
    var globalHll = hll.zero

    // Treat each line as a user id and hash its bytes into a per-batch HLL.
    val users = lines.map(_.toString.toCharArray.map(_.toByte))
    val approxUsers = users.mapPartitions(ids => {
      ids.map(id => hll(id))
    }).reduce(_ + _)

    approxUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        globalHll += partial
        println()
        println()
        println("Estimated distinct users this batch: %d".format(partial.estimatedSize.toInt))
        println("Estimated distinct users overall: %d".format(globalHll.estimatedSize.toInt))
        println()
        println("Approx distinct users this batch: %s".format(partial.approximateSize.toString))
        println("Approx distinct users overall: %s".format(globalHll.approximateSize.toString))
      }
    })
    ssc
  }

  def main(args: Array[String]): Unit = {
    // Rebuild from the checkpoint if one exists; otherwise create a fresh context.
    val context = StreamingContext.getOrCreate("hdfs:///name/spark-streaming/checkpointing", functionToCreateContext _)
    context.start()
    context.awaitTermination()
  }
}