I wrote a custom InputDStream whose compute function creates the RDD by reading from HBase and then applying some mapping, for example:
import java.time.ZonedDateTime

import scala.util.Random

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext, Time}
import org.apache.spark.streaming.dstream.InputDStream

// Config and linearImportanceFractions are helpers from my own project
// (omitted for brevity); fractions is a Map[Int, Double] keyed by importance level.
class DataStream(ssc: StreamingContext, seedOpt: Option[Long] = None)
    extends InputDStream[Array[Byte]](ssc) {

  private var random: Random = _
  private var rdd: RDD[(Int, Array[Byte])] = _
  private var rddCreationTime: Time = _

  private val config = Config("mytable", "colfamily", 3, 10)
  private val numImportanceLevels = 10
  private val importanceLowerFraction = 0.1
  private val importanceUpperFraction = 0.8
  private val fractions = linearImportanceFractions(
    importanceLowerFraction, importanceUpperFraction, numImportanceLevels)

  override def start(): Unit = {
    random = seedOpt match {
      case Some(seed) => new Random(seed)
      case None       => new Random()
    }
  }

  override def stop(): Unit = {} // nothing to clean up

  // The cached HBase RDD is considered stale after 60 seconds.
  private def needRecreateRdd(currentTime: Time): Boolean = {
    if (rdd == null) true
    else {
      val age: Duration = currentTime - rddCreationTime
      age > Seconds(60)
    }
  }

  // Read the last 30 days from HBase and cache the result, recreating
  // the RDD only once the cached copy has gone stale.
  private def getRdd(currentTime: Time): RDD[(Int, Array[Byte])] = {
    if (needRecreateRdd(currentTime)) {
      rddCreationTime = currentTime
      val endTime = ZonedDateTime.now()
      val beginTime = endTime.minusDays(30)
      rdd = config
        .createTrainingRddWithImportance(ssc.sparkContext, beginTime, endTime, numImportanceLevels)
        .cache()
    }
    rdd
  }

  override def compute(validTime: Time): Option[RDD[Array[Byte]]] = {
    // Stratified sample per importance level, then drop the key.
    val samples = getRdd(validTime)
      .sampleByKey(withReplacement = false, fractions, random.nextLong())
      .map { case (_, tfRecord) => tfRecord }
    Some(samples)
  }
}
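For context, here is roughly how such a stream gets wired into a driver program. This is only a simplified sketch: the app name, batch interval, and the foreachRDD sink are illustrative stand-ins, since the real job hands the sampled records to TensorFlowOnSpark instead.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DataStreamApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DataStreamApp")
    val ssc = new StreamingContext(conf, Seconds(10))

    val stream = new DataStream(ssc, seedOpt = Some(42L))

    // A DStream needs at least one output operation before batches run.
    // Counting here also confirms the sampled RDDs are not empty.
    stream.foreachRDD { rdd =>
      println(s"records in this batch: ${rdd.count()}")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}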
This works: the Python side of the job receives the byte records just fine (I am using TensorFlowOnSpark). However, if I open the Streaming tab in the Spark UI, it reports 0 records for every one of my mini-batches.
Why is that?