Re-fitting feature extractors and models on accumulated data

Asked: 2018-08-06 18:16:26

Tags: scala apache-spark machine-learning spark-streaming apache-spark-mllib

I am using Spark Streaming and processing the incoming stream of data (tweets) in batches.
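For context, the surrounding setup is assumed to be the standard DStream pattern; a minimal sketch (the app name, batch interval, and socket source below are placeholders, not the actual tweet ingestion), with ssc.start() and ssc.awaitTermination() at the end of the job:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setAppName("TweetClustering") // placeholder name
val ssc = new StreamingContext(conf, Seconds(10))        // placeholder interval
val sc = ssc.sparkContext                                // used for broadcasts below

// placeholder source; the real tweets presumably arrive from Kafka,
// the Twitter API, or similar
val inputStream = ssc.socketTextStream("localhost", 9999)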

Here is a diagram of what happens to the stream. I split it into 2 streams.

       All batches appended to a single giant Dataframe, and models are 
       repeatedly re-fitted with .fit() on this entire Dataframe
      /
Stream
      \
       New batches turned to dataframes and the previously fitted 
       models are used to .transform() these batches
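The fork itself is just two actions registered on the same cached DStream. The snippets below never show where cachedValidStream comes from, so here is a hedged sketch; the tuple shape and the parseAndValidate helper are guesses inferred from how item._2._2("content") is accessed later:

// hypothetical parse step producing (key, (timestamp, payload map)); only the
// payload map is actually read by the code below
def parseAndValidate(raw: String): (String, (Long, Map[String, Any])) =
  (raw.hashCode.toString, (System.currentTimeMillis, Map("content" -> raw)))

// cached so the two foreachRDD pathways below do not recompute each batch twice
val cachedValidStream = inputStream.map(parseAndValidate).cache()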

Code:

// imports required by the snippets in this question
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.clustering.{KMeans, KMeansModel, LDA, LDAModel}
import org.apache.spark.ml.feature._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{DoubleType, StringType, StructType}
import scala.collection.mutable

// GLOBAL model variables that are to be updated by re-fitting on the big
// accumulated dataframe
val word2vec = new Word2Vec().setInputCol("tokens").setOutputCol("raw_features")
var word2vecModel: Word2VecModel = null

val testKMeans = new KMeans()
  .setK(5)
  .setFeaturesCol("raw_features")
  .setPredictionCol("prediction")
  .setMaxIter(50)
  .setSeed(2)
var kmeansModel: KMeansModel = null

val countVec = new CountVectorizer()
  .setInputCol("tokens")
  .setOutputCol("vec_features")
  .setMinDF(2)
  .setVocabSize(10000)
var countVecModel: CountVectorizerModel = null

var vocab_broadcast: Broadcast[Array[String]] = null

val lda = new LDA()
  .setFeaturesCol("vec_features")
  .setOptimizer("online")
  .setK(10)
var ldaModel: LDAModel = null
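These models only live in driver memory (the body of foreachRDD runs on the driver, which is why reassigning the vars is safe). If a re-fitted model needs to survive a driver restart, MLlib's writer/reader API can persist it; a sketch, with a placeholder path:

// save the latest fitted model after each re-fit (placeholder path)
word2vecModel.write.overwrite().save("hdfs:///models/word2vec-latest")

// restore on startup instead of beginning from scratch
word2vecModel = Word2VecModel.load("hdfs:///models/word2vec-latest")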

// Accumulator pathway: every batch is appended to the single giant DataFrame
cachedValidStream.map{ item =>
  Row(item._2._2("content").asInstanceOf[String])
}.foreachRDD { rdd =>
  val spark = SparkSession.builder().getOrCreate()
  val initial = spark.createDataFrame(rdd,
    new StructType()
      .add("content", StringType))

  val df = initial.dropDuplicates()

  val tokenizer: Tokenizer = new Tokenizer()
    .setInputCol("content")
    .setOutputCol("tokens_raw")
  val remover = new StopWordsRemover()
    .setInputCol("tokens_raw")
    .setOutputCol("tokens")
    .setStopWords(lisOfStopWords)
    .setCaseSensitive(false)
  val tokenized: DataFrame = tokenizer.transform(df)
  val filtered: DataFrame = remover.transform(tokenized)

  // Giant accumulated DF
  if(accumulatorDF == null){
    accumulatorDF = filtered
  } else {
    accumulatorDF = accumulatorDF.union(filtered)
  }
  // Re-fitting on this giant Dataframe
  if(accumulatorDF.count() > 0){
    word2vecModel = word2vec.fit(accumulatorDF)
    countVecModel = countVec.fit(accumulatorDF)
    val vocab = countVecModel.vocabulary
    if(vocab_broadcast != null){
      vocab_broadcast.unpersist()
    }
    vocab_broadcast = sc.broadcast(vocab)
  }
}
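One caveat with this accumulate-by-union pattern (a known Spark behaviour, not something from the original post): every union extends the DataFrame's logical plan, so accumulatorDF.count() and each re-fit replay an ever-longer lineage and get slower over time. Periodically materialising the accumulated DataFrame keeps this bounded; a sketch, assuming a checkpoint directory has been configured:

// once at startup (placeholder path)
spark.sparkContext.setCheckpointDir("hdfs:///checkpoints")

// inside foreachRDD, e.g. every N batches (N is arbitrary):
accumulatorDF = accumulatorDF.checkpoint() // truncates the accumulated lineage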
// The other stream, where each batch is transformed using previously 
// fitted models
cachedValidStream.map { item =>
  Row(item._2._2("content").asInstanceOf[String],
    item._2._2("published_date"),
    item._2._2("retweet_count"),
    item._2._2("post_url"),
    item._2._2("hashtags").asInstanceOf[List[String]].mkString(","),
    item._2._2("post_id"))
}.foreachRDD { rdd =>
  val spark = SparkSession.builder().getOrCreate()
  val initial = spark.createDataFrame(rdd,
    new StructType()
      .add("content", StringType)
      .add("published_date", StringType)
      .add("retweet_count", DoubleType)
      .add("post_url", StringType)
      .add("hashtags", StringType)
      .add("post_id", StringType))
  val df = initial.dropDuplicates(Seq("content"))
  val tokenizer: Tokenizer = new Tokenizer()
    .setInputCol("content")
    .setOutputCol("tokens_raw")
  val remover = new StopWordsRemover()
    .setInputCol("tokens_raw")
    .setOutputCol("tokens")
    .setStopWords(lisOfStopWords)
    .setCaseSensitive(false)
  val tokenized: DataFrame = tokenizer.transform(df)
  val filtered: DataFrame = remover.transform(tokenized)

  if(filtered.count() > 0){
    // here is where the BATCHES of data are TRANSFORMED using the previously
    // fitted models: word2vecModel and countVecModel come from the accumulator
    // pathway and only .transform() the new batch
    val featurizedData = word2vecModel.transform(filtered)
    // note: unlike the two models above, KMeans and LDA are re-fit on every
    // batch here rather than reused
    val KMeansDF = testKMeans.fit(featurizedData).transform(featurizedData)

    val ldapathway = countVecModel.transform(featurizedData)
    ldaModel = lda.fit(ldapathway)
    val ldatopics = ldaModel.describeTopics()

    ldatopics.show(25)

    val index2term = udf { (indices: mutable.WrappedArray[_]) =>
      indices.map {
        case v: Int => vocab_broadcast.value(v)
      }
    }
    val ldaResults = ldaModel.transform(ldapathway)

    ....
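For completeness (this part is not in the original snippet): describeTopics() yields topic, termIndices, and termWeights columns, so the index2term udf above would presumably be applied along these lines:

// map each topic's term indices back to words via the broadcast vocabulary
val readableTopics = ldatopics.withColumn("terms", index2term(col("termIndices")))
readableTopics.show(10, truncate = false)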

So basically: is this the correct way to update models in Spark Streaming, i.e. keeping an accumulated table to continually re-fit on, and using the re-fitted models to transform each new batch?

0 Answers:

There are no answers yet.