model.freqItemsets
The FP-Growth algorithm in Spark 2.4 does not show any results for the full 16 GB dataset, but the same model and code work fine on a 1 GB sample dataset, which is a subset of the 16 GB dataset.
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.fpm.FPGrowth
import org.apache.spark.sql.SparkSession

object Full_Data_Association_4 {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val ss = SparkSession
      .builder
      .appName("Fpgrowth_1")
      .getOrCreate()
    import ss.implicits._

    // Each input line is tab-separated; the second field holds a
    // comma-separated list of items for one transaction.
    val in = ss.read.textFile(args(0))
    val in_2 = in.map(x => x.split("\t")(1))
    val in_3 = in_2.map(t => t.split(",")).toDF("items")

    val fpgrowth = new FPGrowth()
      .setItemsCol("items")
      .setMinSupport(0.1)
      .setMinConfidence(0.6)
    val model = fpgrowth.fit(in_3)

    model.freqItemsets.show(300)
  }
}
I get the following output:
+-----+----+
|items|freq|
+-----+----+
+-----+----+
Answer (score: 0):
This means that with a minimum support of 0.1 and a minimum confidence of 0.6 there are simply no frequent itemsets to report: minSupport is a fraction of all transactions, so an itemset must appear in at least 10% of them, and on the full 16 GB dataset nothing clears that bar even though the smaller sample does. Try lower values, for example a minimum support of 0.001; you may then get some results, depending on the dataset.
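A minimal sketch of that suggestion, assuming the same `in_3` DataFrame as in the question (the value 0.001 is illustrative, not tuned for any particular dataset):

val fpgrowth = new FPGrowth()
  .setItemsCol("items")
  .setMinSupport(0.001)   // was 0.1; 10% support is a very high bar on a large dataset
  .setMinConfidence(0.6)

val model = fpgrowth.fit(in_3)

// Inspect frequent itemsets and the rules derived from them
model.freqItemsets.show(300, truncate = false)
model.associationRules.show(50, truncate = false)

If this still returns an empty table, lower minSupport further in steps and check that the parsed "items" column actually contains non-empty arrays for the full dataset.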