I am currently testing Apache Mahout Parallel Frequent Pattern Mining. Before using it in a real project, I started with a simple piece of code, just to make sure it works the way I expect...
I have not found a complete example that includes code, data, and output.
I currently have a version that compiles and runs (see the Java/Scala code below), but the frequent patterns it returns each contain only a single item (see the sample output below).
Is this the expected behavior? What am I doing wrong?
Thanks for your help...
Scala code:
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth
import java.util.HashSet
import org.apache.mahout.common.iterator.StringRecordIterator
import org.apache.mahout.common.iterator.FileLineIterable
import org.apache.mahout.fpm.pfpgrowth.convertors._
import org.apache.mahout.fpm.pfpgrowth.convertors.integer._
import org.apache.mahout.fpm.pfpgrowth.convertors.string._
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.mahout.fpm.pfpgrowth.convertors.StatusUpdater
import org.apache.hadoop.mapred.OutputCollector
import scala.collection.JavaConversions._
import java.util.{ List => JList }
import org.apache.mahout.common.{ Pair => JPair }
import java.lang.{ Long => JLong }
import org.apache.hadoop.io.{ Text => JText }
val minSupport = 5L
val k: Int = 50
val fps: FPGrowth[String] = new FPGrowth[String]()
val milk = "milk"
val bread = "bread"
val butter = "butter"
val bier = "bier"
val transactionStream: Iterator[JPair[JList[String], JLong]] = Iterator(
  new JPair(List(milk, bread), 10L),
  new JPair(List(butter), 10L),
  new JPair(List(bier), 10L),
  new JPair(List(milk, bread, butter), 5L),
  new JPair(List(milk, bread, bier), 5L),
  new JPair(List(bread), 10L)
)
val frequencies: Collection[JPair[String, JLong]] = fps.generateFList(
  transactionStream, minSupport.toInt)
println("freqList :" + frequencies)
var returnableFeatures: Collection[String] = List(
  milk, bread, butter, bier)
var output: OutputCollector[String, JList[JPair[JList[String], JLong]]] = (
  new OutputCollector[String, JList[JPair[JList[String], JLong]]] {
    def collect(x1: String,
                x2: JList[JPair[JList[String], JLong]]) = {
      println(x1 + ":" +
        x2.map(pair => "[" + pair.getFirst.mkString(",") + "] : " +
          pair.getSecond).mkString("; "))
    }
  }
)
val updater: StatusUpdater = new StatusUpdater {
  def update(status: String) = println("updater : " + status)
}
fps.generateTopKFrequentPatterns(
  transactionStream,
  frequencies,
  minSupport,
  k,
  null, // returnableFeatures
  output,
  updater)
Java code:
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
  public static void main(String[] args) {
    long minSupport = 1L;
    int k = 50;
    FPGrowth<String> fps = new FPGrowth<String>();
    String milk = "milk";
    String bread = "bread";
    String butter = "butter";
    String bier = "bier";
    LinkedList<Pair<List<String>, Long>> data =
        new LinkedList<Pair<List<String>, Long>>();
    data.add(new Pair(Arrays.asList(milk, bread), 1L));
    data.add(new Pair(Arrays.asList(butter), 1L));
    data.add(new Pair(Arrays.asList(bier), 1L));
    data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
    data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
    data.add(new Pair(Arrays.asList(milk, bread), 1L));
    Iterator<Pair<List<String>, Long>> transactions = data.iterator();
    Collection<Pair<String, Long>> frequencies = fps.generateFList(
        transactions, (int) minSupport);
    System.out.println("freqList :" + frequencies);
    Collection<String> returnableFeatures =
        Arrays.asList(milk, bread, butter, bier);
    OutputCollector<String, List<Pair<List<String>, Long>>> output =
        new OutputCollector<String, List<Pair<List<String>, Long>>>() {
          @Override
          public void collect(String x1,
              List<Pair<List<String>, Long>> listPair)
              throws IOException {
            StringBuffer sb = new StringBuffer();
            sb.append(x1 + ":");
            for (Pair<List<String>, Long> pair : listPair) {
              sb.append("[");
              String sep = "";
              for (String item : pair.getFirst()) {
                sb.append(item + sep);
                sep = ", ";
              }
              sb.append("]:" + pair.getSecond());
            }
            System.out.println(" " + sb.toString());
          }
        };
    StatusUpdater updater = new StatusUpdater() {
      public void update(String status) {
        System.out.println("updater :" + status);
      }
    };
    try {
      fps.generateTopKFrequentPatterns(
          transactions,
          frequencies,
          minSupport,
          k,
          null, // returnableFeatures
          output,
          updater);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
Sample output:
freqList :[(bread,4), (milk,4), (bier,2), (butter,2)]
17:48:19,108 INFO ~ Number of unique items 4
17:48:19,109 INFO ~ Number of unique pruned items 4
17:48:19,121 INFO ~ Number of Nodes in the FP Tree: 0
17:48:19,122 INFO ~ Mining FTree Tree for all patterns with 3
updater :FPGrowth Algorithm for a given feature: 3
butter:[butter]:2
17:48:19,130 INFO ~ Found 1 Patterns with Least Support 2
17:48:19,130 INFO ~ Mining FTree Tree for all patterns with 2
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
bier:[bier]:2
17:48:19,130 INFO ~ Found 1 Patterns with Least Support 2
17:48:19,130 INFO ~ Mining FTree Tree for all patterns with 1
updater :FPGrowth Algorithm for a given feature: 1
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
milk:[milk]:4
17:48:19,131 INFO ~ Found 1 Patterns with Least Support 4
17:48:19,131 INFO ~ Mining FTree Tree for all patterns with 0
updater :FPGrowth Algorithm for a given feature: 0
updater :FPGrowth Algorithm for a given feature: 1
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
bread:[bread]:4
17:48:19,131 INFO ~ Found 1 Patterns with Least Support 4
17:48:19,131 INFO ~ Tree Cache: First Level: Cache hits=6 Cache Misses=4
Answer (score: 3):
The code is wrong: the iterator over the transactions is first consumed to compute the frequency list, and is then expected to be consumed again by the FP-Growth algorithm. The problem is that this second pass returns no values, because the iterator has already reached its end...
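To see why the second call yields nothing, here is a minimal Java sketch (not Mahout-specific; the class and variable names are purely illustrative) showing that a java.util.Iterator is a one-shot cursor over its collection:

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

class IteratorExhaustionDemo {
  public static void main(String[] args) {
    List<String> items = Arrays.asList("milk", "bread", "butter");

    Iterator<String> it = items.iterator();
    int firstPass = 0;
    while (it.hasNext()) {   // the first pass consumes the iterator...
      it.next();
      firstPass++;
    }
    int secondPass = 0;
    while (it.hasNext()) {   // ...so a second pass over the same object sees nothing
      it.next();
      secondPass++;
    }
    System.out.println(firstPass + " / " + secondPass); // prints "3 / 0"

    // A fresh iterator obtained from the underlying collection starts over.
    System.out.println(items.iterator().hasNext());     // prints "true"
  }
}

This is exactly what happens when the same transactions iterator is handed first to generateFList and then to generateTopKFrequentPatterns.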
For reference, here is the corrected Java code:
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;
import java.io.IOException;
import java.util.*;
import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;
import org.apache.hadoop.io.Text;
class FPGrowthDemo {
  public static void main(String[] args) {
    long minSupport = 1L;
    int k = 50;
    FPGrowth<String> fps = new FPGrowth<String>();
    String milk = "milk";
    String bread = "bread";
    String butter = "butter";
    String bier = "bier";
    LinkedList<Pair<List<String>, Long>> data =
        new LinkedList<Pair<List<String>, Long>>();
    data.add(new Pair(Arrays.asList(milk, bread), 1L));
    data.add(new Pair(Arrays.asList(butter), 1L));
    data.add(new Pair(Arrays.asList(bier), 1L));
    data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
    data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
    data.add(new Pair(Arrays.asList(milk, bread), 1L));
    // This line is removed...
    // Iterator<Pair<List<String>, Long>> transactions = data.iterator();
    Collection<Pair<String, Long>> frequencies = fps.generateFList(
        data.iterator(), // use an iterator here...
        (int) minSupport);
    System.out.println("freqList :" + frequencies);
    OutputCollector<String, List<Pair<List<String>, Long>>> output =
        new OutputCollector<String, List<Pair<List<String>, Long>>>() {
          @Override
          public void collect(String x1,
              List<Pair<List<String>, Long>> listPair)
              throws IOException {
            StringBuffer sb = new StringBuffer();
            sb.append(x1 + ":");
            for (Pair<List<String>, Long> pair : listPair) {
              sb.append("[");
              String sep = "";
              for (String item : pair.getFirst()) {
                sb.append(item + sep);
                sep = ", ";
              }
              sb.append("]:" + pair.getSecond());
            }
            System.out.println(" " + sb.toString());
          }
        };
    StatusUpdater updater = new StatusUpdater() {
      public void update(String status) {
        System.out.println("updater :" + status);
      }
    };
    try {
      fps.generateTopKFrequentPatterns(
          // changed here (previously: transactions)
          data.iterator(), // use a "fresh" iterator
          frequencies,
          minSupport,
          k,
          null,
          output,
          updater);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
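Note that the Scala code in the question has the same problem: transactionStream is passed first to generateFList and then again to generateTopKFrequentPatterns, so the second call iterates over an already exhausted iterator; the fix is the same, namely supplying a freshly built iterator for each call. If the transactions come from a genuinely one-shot source (for example, a file read line by line), they can be buffered into a list first. The sketch below is only illustrative; the TransactionBuffer class and its buffer method are hypothetical helpers, not part of Mahout.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.mahout.common.Pair;

class TransactionBuffer {
  // Drain a one-shot transaction source into a list; afterwards,
  // list.iterator() can be called once per pass over the data.
  static <A> List<Pair<List<A>, Long>> buffer(Iterator<Pair<List<A>, Long>> oneShot) {
    List<Pair<List<A>, Long>> buffered = new ArrayList<Pair<List<A>, Long>>();
    while (oneShot.hasNext()) {
      buffered.add(oneShot.next());
    }
    return buffered;
  }
}

// Usage (sketch):
//   List<Pair<List<String>, Long>> data = TransactionBuffer.buffer(source);
//   fps.generateFList(data.iterator(), (int) minSupport);
//   fps.generateTopKFrequentPatterns(data.iterator(), frequencies,
//       minSupport, k, null, output, updater);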