如何在不使用 UDF 的情况下,在 Apache Pig 中计算数据集的众数(统计学中的 mode)?
A,20
A,10
A,10
B,40
B,40
B,20
B,10
-- Load comma-separated (key, value) records. Pig requires the AS schema to be
-- enclosed in parentheses.
data = LOAD 'myData.txt' USING PigStorage(',') AS (key, value);
-- Collect all records that share a key into one bag per key.
byKey = GROUP data BY key;
-- MODE() is a placeholder for the "most frequent value" aggregate that this
-- question asks how to define; it is not defined anywhere in this script.
mode = FOREACH byKey GENERATE MODE(data.value); -- How to define MODE() ??
DUMP mode;
-- Correct answer: (A, 10), (B, 40)
答案 0 :(得分:3)
这是一个每个键只找到一个结果的版本:
-- Mode per key, returning exactly one value per key: when several values are
-- tied for the highest count, LIMIT 1 keeps just one of them.
data = LOAD 'mode_data.dat' USING PigStorage(',') AS (key, value);

-- Count how many times each distinct (key, value) pair occurs.
pairGroups = GROUP data BY (key, value);
pairCounts = FOREACH pairGroups
             GENERATE FLATTEN(group) AS (key, value),
                      COUNT(data) AS cnt;

-- Within each key, rank its values by occurrence count and keep the first row.
countsByKey = GROUP pairCounts BY key;
mode = FOREACH countsByKey {
    ranked = ORDER pairCounts BY cnt DESC;
    -- one of the most frequent values for the key of this group
    topFreq = LIMIT ranked 1;
    GENERATE FLATTEN(topFreq.(key, value));
};
此版本会找出同一个键下所有出现频率并列最高的值:
-- Mode per key, keeping every value that is tied for the highest occurrence count.
data = LOAD 'mode_data.dat' USING PigStorage(',') AS (key, value);

-- Count how many times each distinct (key, value) pair occurs.
byKeyValue = GROUP data BY (key, value);
cntKeyValue = FOREACH byKeyValue GENERATE FLATTEN(group) AS (key, value), COUNT(data) AS cnt;

-- Highest occurrence count for each key. MAX() reads the maximum directly, so
-- the per-key bag does not have to be sorted just to take its first row.
byKey = GROUP cntKeyValue BY key;
mostFreqCnt = FOREACH byKey GENERATE group AS key, MAX(cntKeyValue.cnt) AS cnt;

-- Match every (key, value, cnt) row against the per-key maximum on (key, cnt).
-- Groups whose mostFreqCnt bag is non-empty hold exactly the values whose count
-- equals the maximum for their key. The original author chose COGROUP here
-- because an alternative formulation raised errors at execution time.
modeAll = COGROUP cntKeyValue BY (key, cnt), mostFreqCnt BY (key, cnt);
mode = FOREACH (FILTER modeAll BY NOT IsEmpty(mostFreqCnt))
       GENERATE FLATTEN(cntKeyValue.(key, value)) AS (key, value);
答案 1 :(得分:1)
我有一个用于计算众数的简单 UDF(使用 Apache commons-math3 和 Pig 0.10.0):
public class MODE extends EvalFunc<DataBag> {
TupleFactory mTupleFactory = TupleFactory.getInstance();
BagFactory mBagFactory = BagFactory.getInstance();
public DataBag exec(Tuple inputTuple) throws IOException {
if (inputTuple == null || inputTuple.size() == 0) {
return null;
}
try {
Frequency frequency = new Frequency();
DataBag output = mBagFactory.newDefaultBag();
DataBag values = (DataBag) inputTuple.get(0);
for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
Tuple tuple = it.next();
frequency.addValue((Long) tuple.get(0));
}
Set<Long> setOfMostFrequentVals = new HashSet<Long>();
Long greatestFrequency = 0l;
for (Iterator<Comparable<?>> it = frequency.valuesIterator(); it.hasNext();) {
Long val = (Long) it.next();
if (frequency.getCount(val) >= greatestFrequency) {
if (frequency.getCount(val) > greatestFrequency) {
setOfMostFrequentVals.clear();
greatestFrequency = frequency.getCount(val);
}
setOfMostFrequentVals.add(val);
}
}
for (Long mostFequentVal : setOfMostFrequentVals) {
output.add(mTupleFactory.newTuple(mostFequentVal));
}
return output;
} catch (Exception e) {
int errCode = 2106;
String msg = "Error while computing mode in " + this.getClass().getSimpleName();
throw new ExecException(msg, errCode, PigException.BUG, e);
}
}
}
答案 2 :(得分:0)