I am running the program below on Spark 1.3.1. Spark Streaming is watching a directory in HDFS for new files and should process them as they arrive. I have read that the best approach is to move the files into the watched directory from another location on the same HDFS, so that the operation is atomic.

I start my streaming job, add a bunch of small files to an unrelated HDFS directory, and then move those files from that directory into the watched HDFS directory (all with simple shell commands). But my streaming job does not pick these up as new files, so it never processes them.
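For reference, this is essentially what my move step does: a rename within a single HDFS is one namenode metadata operation, so the file should appear in the watched directory atomically (the shell equivalent I use is hdfs dfs -mv). A minimal sketch with Hadoop's FileSystem API, with placeholder paths standing in for my staging and watched directories:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object MoveIntoWatchedDir {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    // Placeholder paths: the file is written somewhere else on HDFS first,
    // then renamed into the directory the stream is watching.
    val src = new Path("hdfs:///name/spark-streaming/staging/part-00000")
    val dst = new Path("hdfs:///name/spark-streaming/data/part-00000")
    // rename() within one HDFS is atomic, so the stream never sees a
    // half-written file.
    if (!fs.rename(src, dst)) sys.error(s"rename failed: $src -> $dst")
  }
}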
Currently I am using textFileStream, but I am open to using fileStream instead. However, I get an error with this call:

val lines = ssc.fileStream[LongWritable, Text, TextInputFormat]("hdfs:///name/spark-streaming/data/", (p: Path) => true, false)
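One thing I noticed while trying this: fileStream's third type parameter is bounded by the new-API org.apache.hadoop.mapreduce.InputFormat, while the import in my code below is the old org.apache.hadoop.mapred.TextInputFormat. Assuming my error is that type mismatch, this variant should satisfy the bound (ssc is the StreamingContext from the code below):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
// New-API TextInputFormat; the mapred one does not satisfy fileStream's bound.
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

val lines = ssc.fileStream[LongWritable, Text, TextInputFormat](
    "hdfs:///name/spark-streaming/data/", (p: Path) => true, false)
  .map { case (_, text) => text.toString } // fileStream yields (key, value) pairs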
package com.com.spark.prototype

// The Hadoop imports below are only needed for the fileStream variant above.
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark._
import org.apache.spark.streaming._
import com.twitter.algebird.HyperLogLogMonoid
object HLLStreamingHDFSTest {

  def functionToCreateContext(): StreamingContext = {
    val conf = new SparkConf().set("spark.executor.extraClassPath", "/home/hadoop/spark/conf:/home/hadoop/conf:/home/hadoop/spark/classpath/emr/*:/home/hadoop/spark/classpath/emrfs/*:/home/hadoop/share/hadoop/common/lib/*:/home/hadoop/share/hadoop/common/lib/hadoop-lzo.jar")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint("/name/spark-streaming/checkpointing")

    // Watch the HDFS directory for new files, one batch every 5 seconds.
    val lines = ssc.textFileStream("hdfs:///name/spark-streaming/data/")

    // HyperLogLog with 15 bits of precision; globalHll accumulates across batches.
    val hll = new HyperLogLogMonoid(15)
    var globalHll = hll.zero

    // Treat each line as a user id and hash its bytes into a per-batch HLL.
    val users = lines.map(_.toString.toCharArray.map(_.toByte))
    val approxUsers = users.mapPartitions(ids => {
      ids.map(id => hll(id))
    }).reduce(_ + _)

    approxUsers.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        val partial = rdd.first()
        globalHll += partial
        println()
        println()
        println("Estimated distinct users this batch: %d".format(partial.estimatedSize.toInt))
        println("Estimated distinct users overall: %d".format(globalHll.estimatedSize.toInt))
        println()
        println("Approx distinct users this batch: %s".format(partial.approximateSize.toString))
        println("Approx distinct users overall: %s".format(globalHll.approximateSize.toString))
      }
    })
    ssc
  }

  def main(args: Array[String]): Unit = {
    // Rebuild from the checkpoint if one exists; otherwise create a fresh context.
    val context = StreamingContext.getOrCreate("hdfs:///name/spark-streaming/checkpointing", functionToCreateContext _)
    context.start()
    context.awaitTermination()
  }
}