使用 Spark Streaming 的线性回归垃圾邮件分类器无法正常工作

时间:2017-11-21 15:30:17

标签: apache-spark spark-streaming apache-spark-mllib apache-spark-ml

请帮我解决这个问题……我正在尝试编写一个流式应用程序：使用 Spark Streaming 从文件夹接收邮件，并基于线性回归（linear regression）判断它们是否是垃圾邮件。应用程序正在运行，但当我往目录中放入文件时，什么反应都没有……我不明白为什么，请帮忙。请注意，trainingDir 和 testDir 两个目录中的输入都应采用如下形式：`<1/0> <邮件正文>`。代码如下：

object Test {

  /**
   * Streaming spam detector. Reads labelled emails (one per line, formatted
   * as "<label> <email text>" where label is 0 or 1) from two directories:
   * args(0) = training directory, args(1) = test directory. Trains a
   * streaming linear-regression model on the former and prints predictions
   * for the latter every 5-second batch.
   */
  def main(args: Array[String]) {

    if (args.length != 2) {
      System.err.println("Usage: SpamDetector <trainingDir> <testDir>")
      System.exit(1)
    }

    // Dimensionality of the hashed term-frequency feature space.
    val numFeatures = 10000

    // Create conf object
    val conf = new SparkConf().setAppName("StreamingLinearRegression")

    // create spark context object
    val sc = new SparkContext(conf)

    // Create a StreamingContext with a 5-second batch size from a SparkConf
    val ssc = new StreamingContext(sc, Seconds(5))

    // HashingTF is stateless, so one shared instance is safe for all records.
    val tf = new HashingTF(numFeatures)

    /**
     * Parses one input line of the form "<label> <email text>" into a
     * LabeledPoint: the leading token is the 0/1 label; every
     * space-separated token of the line is hashed into the
     * numFeatures-dimensional term-frequency vector.
     *
     * NOTE(review): the label token itself is also hashed into the feature
     * vector (the whole line is transformed, label included) — probably
     * unintended; confirm and strip the first token if so.
     */
    def convertEmailToLabledPoint(line: String): LabeledPoint = {
      println(line)
      val lineParts = line.split(" ")

      // Each email is split into words, and each word is mapped to one
      // feature.
      val features = tf.transform(line.split(" "))

      println(lineParts(0).toDouble)

      LabeledPoint(lineParts(0).toDouble, features)
    }

    // BUG FIX: the original code created an unused stream from args(0) and
    // then read BOTH training and test data from args(1), so the model was
    // never fed from <trainingDir>. Train from args(0), test from args(1).
    //
    // NOTE: textFileStream only picks up files that are atomically moved or
    // renamed INTO the directory AFTER the streaming context starts; files
    // already present (or copied in slowly) are silently ignored — this is
    // the usual reason "nothing happens" when dropping files in.
    val trainingData =
      ssc.textFileStream(args(0)).map(convertEmailToLabledPoint).cache()

    val testData = ssc.textFileStream(args(1)).map(convertEmailToLabledPoint)

    // Streaming linear regression, weights initialised to zero.
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(numFeatures))

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()

    ssc.awaitTermination()
  }
}

我也尝试过用套接字（socket）代替目录，但仍然无法工作……代码如下：

object SpamDetector {

  /**
   * Streaming spam detector fed over two local TCP sockets: port 19000
   * supplies labelled training lines, port 19001 supplies lines to
   * classify. Each line has the form "<label> <email text>".
   */
  def main(args: Array[String]) {

    // Dimensionality of the hashed term-frequency feature space.
    val numFeatures = 10

    // Spark configuration plus batch and streaming contexts
    // (5-second micro-batches).
    val conf = new SparkConf().setAppName("StreamingLinearRegression")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))

    // Hashing trick: each word is mapped to one of numFeatures buckets.
    // HashingTF is stateless and deterministic, so sharing one instance
    // across all records is equivalent to building one per line.
    val tf = new HashingTF(numFeatures)

    /**
     * Converts one raw line into a LabeledPoint: the first space-separated
     * token is parsed as the numeric label, and every token of the line
     * (label included, as in the original) is hashed into the feature
     * vector. Echoes the line and the parsed label to stdout.
     */
    def toLabeledPoint(line: String): LabeledPoint = {
      println(line)
      val tokens = line.split(" ")
      val label = tokens(0).toDouble
      println(label)
      LabeledPoint(label, tf.transform(tokens))
    }

    // Training stream: labelled examples arriving on port 19000.
    val trainingData =
      ssc.socketTextStream("localhost", 19000).map(toLabeledPoint).cache()

    // Test stream: emails to classify arriving on port 19001.
    val testData =
      ssc.socketTextStream("localhost", 19001).map(toLabeledPoint)

    // Streaming linear regression, weights initialised to zero.
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(numFeatures))

    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}

0 个答案:

没有答案