For some reason, my Reducer does not seem to be running.

My driver is:
import java.io.File;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class PageRank {

    public static void main(String[] args) throws Exception {
        PageRank pageRanking = new PageRank();
        // In and Out dirs in HDFS
        pageRanking.runXmlParsing(args[0], args[1]);
        System.out.println("finished");
    }

    public void runXmlParsing(String inputPath, String outputPath) throws IOException {
        Configuration conf = new Configuration();
        conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
        conf.set(XmlInputFormat.END_TAG_KEY, "</page>");

        Job job1 = Job.getInstance(conf);
        job1.setJarByClass(PageRank.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);

        // Our class to parse links from content.
        job1.setMapperClass(WikiPageXMLMapper.class);
        job1.setReducerClass(WikiLinksReducer.class);

        job1.setInputFormatClass(XmlInputFormat.class);
        job1.setOutputFormatClass(TextOutputFormat.class);

        // Remove output if already exists
        FileSystem.getLocal(conf).delete(new Path(outputPath), true);

        FileInputFormat.setInputPaths(job1, new Path(inputPath));
        FileOutputFormat.setOutputPath(job1, new Path(outputPath));
        System.out.println("BEFORE RUN");

        try {
            job1.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    public void deleteDir(File dir) {
        File[] files = dir.listFiles();
        for (File myFile : files) {
            if (myFile.isDirectory()) {
                deleteDir(myFile);
            }
            myFile.delete();
        }
    }
}
My Mapper is:
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;

public class WikiPageXMLMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    public void map(LongWritable key, Text value, Context output) throws IOException {
        String[] titleAndText = parseTitleAndText(value.toString());
        String pageString = titleAndText[0];
        Text page = new Text(pageString.replace(' ', '_'));

        String[] parts = titleAndText[1].split("\\[\\[");
        String pages = "!@#$ ";

        for (int i = 1; i < parts.length; i++) {
            int lastIndexBrackets = parts[i].lastIndexOf("]]");
            // This checks and skips the first part of the outer link
            if (lastIndexBrackets == -1)
                continue;

            String insideLinkPlusExtra = parts[i].substring(0, lastIndexBrackets);
            int multipleClosingBrackets = insideLinkPlusExtra.indexOf("]]");

            String otherPage = insideLinkPlusExtra;
            if (multipleClosingBrackets != -1) {
                otherPage = insideLinkPlusExtra.substring(0, multipleClosingBrackets);
            }

            otherPage = otherPage.split("\\|")[0];
            otherPage = checkForDuplicates(otherPage, pages);
            otherPage = (otherPage.indexOf(":") == -1) ? otherPage : "";
            otherPage = (otherPage.indexOf("#") == -1) ? otherPage : "";
            otherPage = checkForSubpageLinks(otherPage);
            otherPage = checkForRedLink(otherPage);

            if (otherPage == "")
                continue;

            Text oP = new Text(otherPage.replace(' ', '_'));
            pages += oP + " ";

            // taking each outlink and making it its own key (ingraph)
            try {
                output.write(new Text(oP), new Text(page));
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }

        // Designate this page as not a redlink
        try {
            output.write(new Text(page), new Text("!@#$"));
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return;
    }
}
My Reducer is:
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WikiLinksReducer extends Reducer<Text, Text, Text, Text> {

    public void reduce(Text key, Iterator<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException {
        System.out.println("REDUCER");
        String links = "";
        boolean isNotRedLink = false;

        System.out.println("Starting reduce");

        // Brett's concern (and Zach's): if n pages link to a redlink
        // we will iterate n times and it could be wasteful
        while (values.hasNext()) {
            String v = values.next().toString();
            // Check first outlink is not #, if so, it is a redlink
            if (v.equals("!@#$")) {
                isNotRedLink = true;
                continue;
            } else {
                links += v;
                continue;
            }
        }

        // If the key is not a redlink, send it to the output
        if (isNotRedLink) {
            try {
                output.write(key, new Text(links));
                output.write(key, new Text("TESTING!"));
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            System.out.println(links);
        } else {
            System.out.println(output);
            try {
                output.write(key, new Text("BLEG"));
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            System.out.println(key + " IS A RED LINK");
            return;
        }
    }
}
The System.out.println(...) calls in my Reducer never print to the console, and the program's output (the files it leaves on my hard drive) contains only the Mapper's results.
Answer 0 (score: 0)
I feel silly. I used Iterable instead of Iterator in the line

public void reduce(Text key, Iterable<Text> values, org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context output) throws IOException, InterruptedException {

and my problem was solved.
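For context: the new-API base class declares reduce(KEYIN key, Iterable<VALUEIN> values, Context context), so a method that takes an Iterator<Text> never overrides it. The framework then falls back to the inherited default reduce, which just writes each (key, value) pair through unchanged, which matches the symptom above of only seeing Mapper output and never reaching the println calls. Below is a minimal sketch of the corrected class; it keeps the same key/marker convention as the original but drops the debug printlns and the red-link "BLEG" branch for brevity, so it is an illustration of the signature fix rather than the asker's full code.

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WikiLinksReducer extends Reducer<Text, Text, Text, Text> {

    // @Override only compiles if the signature really matches the base class,
    // so it would have caught the Iterator/Iterable mix-up at compile time.
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        String links = "";
        boolean isNotRedLink = false;

        // An Iterable is traversed with a for-each loop instead of hasNext()/next()
        for (Text value : values) {
            String v = value.toString();
            if (v.equals("!@#$")) {
                // Marker record emitted by the Mapper: this page exists (not a redlink)
                isNotRedLink = true;
            } else {
                // Accumulate the pages that link to this key
                links += v + " ";
            }
        }

        // Only emit pages that actually exist
        if (isNotRedLink) {
            context.write(key, new Text(links));
        }
    }
}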