// Calculate the tf-idf of every word in every document.
// Skeleton only: the reduce body below lists the steps that still have to be implemented.
public static class CalReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// Note: key is a word, values are in the form of
// (filename=frequency)
// Step 1: sum up the number of files containing a particular word
// Step 2: for every filename=frequency in the value, compute tf-idf of this
// word in filename and output (word@filename, tfidf)
答案 0 :(得分:0)
Text outputKey = new Text();
Text outputValue = new Text();
//calculate tf-idf of every word in every document)
public static class CalReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// Note: key is a word, values are in the form of
// (filename=frequency)
Map<String, Integer> tfs = new HashMap<>();
for (Text value: values) {
String[] valueParts = value.split("=");
tfs.put(valueParts[0], Integer.parseInt(valueParts[1])); //do the necessary checks here
int numDocs = context.getInt("noOfDocuments"); //set this in the Driver, if you know it already, or set a counter in the mapper to get it here using getCounter()
double IDF = Math.log10((double)numDocs/tfs.keySet().size());
// for every filename=frequency in the value, compute tf-idf of this
// word in filename and output (word@filename, tfidf)
for (String file : tfs.keySet()) {
outputValue.set(new String(tfs.get(file)*IDF)); //you could also set the outputValue to be a DoubleWritable
context.write(outputKey, outputValue);
如果您将 tf 定义为 frequency / maxFrequency,
则可以在 Java 8 中使用以下方法实现这一技巧:
// NOTE(review): Iterable's default spliterator() does not report a size, in which case
// getExactSizeIfKnown() returns -1 — confirm the concrete type of `values` is SIZED before relying on DF.
long DF = values.spliterator().getExactSizeIfKnown();
// IDF as log10(numDocs / DF); numDocs is defined outside this fragment.
double IDF = Math.log10((double)numDocs/DF);
如这篇帖子(this post)中所述;也可以采用同一帖子中其他不使用循环的建议(否则,可以沿用上一个答案的做法)。
Text outputKey = new Text();
Text outputValue = new Text();
//calculate tf-idf of every word in every document)
public static class CalReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
int numDocs = context.getInt("noOfDocuments"); //set this in the Driver, if you know it already, or set a counter in the mapper to get it here using getCounter()
long DF = values.spliterator().getExactSizeIfKnown();
double IDF = Math.log10((double)numDocs/DF);
// Note: key is a word, values are in the form of
// (filename=frequency)
for (Text value: values) {
String[] valueParts = value.split("=");
outputValue.set(new String(Integer.parseInt(valueParts[1]) * IDF);
context.write(outputKey, outputValue);