请帮我解决这个问题:我正在尝试编写一个流式应用程序,使用 Spark Streaming 从文件夹接收邮件,并基于线性回归(linear regression)判断它们是否是垃圾邮件。应用程序正在运行,但当我向目录中放置文件时,看不到任何反应,我不明白为什么,请帮忙。请注意,trainingDir 和 testDir 两个目录的输入都应采用以下形式:"1/0 邮件正文"。这是代码:
object Test {
  /**
   * Streaming spam detector driver.
   *
   * Usage: SpamDetector <trainingDir> <testDir>
   * Each line in both directories must look like: "<label> word word ..."
   * where <label> is 1.0 (spam) or 0.0 (ham).
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println("Usage: SpamDetector <trainingDir> <testDir>")
      System.exit(1)
    }
    val numFeatures = 10000
    // Create conf object
    val conf = new SparkConf().setAppName("StreamingLinearRegression")
    // create spark context object
    val sc = new SparkContext(conf)
    // Create a StreamingContext with a 5-second batch size from a SparkConf
    val ssc = new StreamingContext(sc, Seconds(5))
    // HashingTF is stateless and serializable, so one shared instance is fine.
    val tf = new HashingTF(numFeatures)

    /** Parses "label word word ..." into a LabeledPoint: first token is the
     *  label, the whole token sequence is hashed into a feature vector. */
    def convertEmailToLabeledPoint(line: String): LabeledPoint = {
      println(line)
      val lineParts = line.split(" ")
      // Each email is split into words, and each word is mapped to one feature.
      val features = tf.transform(lineParts)
      println(lineParts(0).toDouble)
      LabeledPoint(lineParts(0).toDouble, features)
    }

    // BUG FIX: the training stream must read args(0) (<trainingDir>); the
    // original code read args(1) for BOTH streams, so the training directory
    // was never consumed. The unused `lines` stream on args(0) was removed.
    //
    // NOTE: textFileStream only detects files that are atomically created or
    // moved into the directory AFTER the stream has started; files already
    // present at startup, or files written in place, are ignored — this is
    // the usual reason "nothing happens" when dropping files in.
    val trainingData =
      ssc.textFileStream(args(0)).map(convertEmailToLabeledPoint).cache()
    val testData = ssc.textFileStream(args(1)).map(convertEmailToLabeledPoint)

    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(numFeatures))
    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
我也尝试使用套接字(socket)而不是目录,但仍然无法工作。这是代码:
object SpamDetector {
  /**
   * Socket-based variant of the streaming spam detector.
   *
   * Reads training lines from localhost:19000 and test lines from
   * localhost:19001; each line must be "<label> word word ..." with the
   * label 1.0 (spam) or 0.0 (ham).
   */
  def main(args: Array[String]): Unit = {
    val numFeatures = 10

    // Spark setup: one context, micro-batches every 5 seconds.
    val conf = new SparkConf().setAppName("StreamingLinearRegression")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))

    /** Turns one "label word word ..." line into a LabeledPoint. */
    def convertEmailToLabledPoint(line: String): LabeledPoint = {
      val tf = new HashingTF(numFeatures)
      println(line)
      val tokens = line.split(" ")
      // Hash every token (label included) into the feature vector.
      val featureVector = tf.transform(line.split(" "))
      println(tokens(0).toDouble)
      LabeledPoint(tokens(0).toDouble, featureVector)
    }

    // Training stream (cached so SGD can iterate over each batch).
    val trainingData =
      ssc.socketTextStream("localhost", 19000).map(convertEmailToLabledPoint).cache()
    // Stream of emails to classify.
    val testData =
      ssc.socketTextStream("localhost", 19001).map(convertEmailToLabledPoint)

    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(numFeatures))
    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}