I'm using Spark Streaming and processing the incoming stream of data (tweets) in batches.
Here is a diagram of what happens to the stream; I split the stream into 2 streams.
          All batches appended to a single giant Dataframe, and models are
          repeatedly re-fitted with .fit() on this entire Dataframe
        /
Stream
        \
          New batches turned to dataframes and the previously fitted
          models are used to .transform() these batches
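
In skeleton form the split looks roughly like this (a minimal sketch only, not my actual job: a local socket source and the names TwoBranchSketch/tweetStream stand in for the real cachedValidStream setup, and the fitting/transforming bodies are elided; the full code follows below):

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TwoBranchSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("two-branch-sketch").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))

    // Stand-in source; in the real job this is the cached stream of tweets (cachedValidStream)
    val tweetStream = ssc.socketTextStream("localhost", 9999).cache()

    var accumulatorDF: DataFrame = null

    // Branch 1: append every batch to one giant DataFrame and re-fit the models on it
    tweetStream.foreachRDD { rdd =>
      val spark = SparkSession.builder().getOrCreate()
      import spark.implicits._
      val batchDF = rdd.toDF("content")
      accumulatorDF = if (accumulatorDF == null) batchDF else accumulatorDF.union(batchDF)
      // ... word2vecModel = word2vec.fit(accumulatorDF), countVecModel = countVec.fit(accumulatorDF), etc.
    }

    // Branch 2: transform only the current batch with the previously fitted models
    tweetStream.foreachRDD { rdd =>
      val spark = SparkSession.builder().getOrCreate()
      import spark.implicits._
      val batchDF = rdd.toDF("content")
      // ... word2vecModel.transform(batchDF), countVecModel.transform(...), etc.
    }

    ssc.start()
    ssc.awaitTermination()
  }
}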
Code:
// GLOBAL model variables that are to be updated by re-fitting on the big
// dataframe
val word2vec = new Word2Vec().setInputCol("tokens").setOutputCol("raw_features")
var word2vecModel: Word2VecModel = null

val testKMeans = new KMeans()
  .setK(5)
  .setFeaturesCol("raw_features")
  .setPredictionCol("prediction")
  .setMaxIter(50)
  .setSeed(2)
var kmeansModel: KMeansModel = null

var countVec = new CountVectorizer()
  .setInputCol("tokens")
  .setOutputCol("vec_features")
  .setMinDF(2)
  .setVocabSize(10000)
var countVecModel: CountVectorizerModel = null
var vocab_broadcast: Broadcast[scala.Array[scala.Predef.String]] = null

val lda = new LDA().setFeaturesCol("vec_features")
lda.setOptimizer("online").setK(10)
var ldaModel: LDAModel = null
// Accumulator
cachedValidStream.map { item =>
  Row(item._2._2("content").asInstanceOf[String])
}.foreachRDD { rdd =>
  val spark = SparkSession.builder().getOrCreate()
  val initial = spark.createDataFrame(rdd,
    new StructType()
      .add("content", StringType))
  val df = initial.dropDuplicates()

  val tokenizer: Tokenizer = new Tokenizer()
    .setInputCol("content")
    .setOutputCol("tokens_raw")
  val remover = new StopWordsRemover()
    .setInputCol("tokens_raw")
    .setOutputCol("tokens")
    .setStopWords(lisOfStopWords)
    .setCaseSensitive(false)
  val tokenized: DataFrame = tokenizer.transform(df)
  val filtered: DataFrame = remover.transform(tokenized)

  // Giant accumulated DF
  if (accumulatorDF == null) {
    accumulatorDF = filtered
  } else {
    accumulatorDF = accumulatorDF.union(filtered)
  }

  // Re-fitting on this giant Dataframe
  if (accumulatorDF.count() > 0) {
    word2vecModel = word2vec.fit(accumulatorDF)
    countVecModel = countVec.fit(accumulatorDF)
    val vocab = countVecModel.vocabulary
    if (vocab_broadcast != null) {
      vocab_broadcast.unpersist()
    }
    vocab_broadcast = sc.broadcast(vocab)
  }
}
// The other stream, where each batch is transformed using the previously
// fitted models
cachedValidStream.map { item =>
  Row(item._2._2("content").asInstanceOf[String],
    item._2._2("published_date"),
    item._2._2("retweet_count"),
    item._2._2("post_url"),
    item._2._2("hashtags").asInstanceOf[List[String]].mkString(","),
    item._2._2("post_id"))
}.foreachRDD { rdd =>
  val spark = SparkSession.builder().getOrCreate()
  val initial = spark.createDataFrame(rdd,
    new StructType()
      .add("content", StringType)
      .add("published_date", StringType)
      .add("retweet_count", DoubleType)
      .add("post_url", StringType)
      .add("hashtags", StringType)
      .add("post_id", StringType))
  val df = initial.dropDuplicates(Seq("content"))

  val tokenizer: Tokenizer = new Tokenizer()
    .setInputCol("content")
    .setOutputCol("tokens_raw")
  val remover = new StopWordsRemover()
    .setInputCol("tokens_raw")
    .setOutputCol("tokens")
    .setStopWords(lisOfStopWords)
    .setCaseSensitive(false)
  val tokenized: DataFrame = tokenizer.transform(df)
  val filtered: DataFrame = remover.transform(tokenized)

  if (filtered.count() > 0) {
    // here is where the BATCHES of data are TRANSFORMED using previously fitted models
    val featurizedData = word2vecModel.transform(filtered)
    val KMeansDF = testKMeans.fit(featurizedData).transform(featurizedData)
    val ldapathway = countVecModel.transform(featurizedData)
    ldaModel = lda.fit(ldapathway)
    val ldatopics = ldaModel.describeTopics()
    ldatopics.show(25)

    val index2term = udf { (indices: mutable.WrappedArray[_]) =>
      indices.map {
        case v: Int => vocab_broadcast.value(v)
      }
    }
    val ldaResults = ldaModel.transform(ldapathway)
    ....
So basically: is this the correct way to update models in Spark Streaming, i.e. keeping an accumulated table so the models can be continually re-fitted, and then using those re-fitted models to transform the new batches?