我被困在单独的每个文件wordcount结果中1行。 我希望输出所有文件结果为1个文件,每个文件结果代表1行。
预期的output.txt格式
file1 1, 2, 3, 4, …, 100
file2 5, 2, 9, 6, …, 30
目前输出结果 每个文件的wordcount结果总结在一起
file1 123,22,31,...,100
file2 123,22,31,...,100
run()的
MultipleInputs.addInputPath(job,in_path1,TextInputFormat.class,Map.class);
MultipleInputs.addInputPath(job,in_path2,TextInputFormat.class,Map.class);
地图
context.write(new Text("file1"),output);
context.write(new Text("file2"),output);
减少
context.write(new Text("file1"),new Text(sp.toString()));
context.write(new Text("file2"),new Text(sp.toString()));
地图
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
private static final HashMap<String, Object> counter = new HashMap<>();
private Text output = new Text();
private String mapToString(HashMap<String, Object> map) {
StringBuilder sb = new StringBuilder();
Iterator<Entry<String, Object>> iter = map.entrySet().iterator();
while (iter.hasNext()) {
Entry<String, Object> entry = iter.next();
sb.append(entry.getKey());
sb.append('=');
sb.append(entry.getValue().toString());
if (iter.hasNext()) {
sb.append(';').append(' ');
}
}
return sb.toString();
}
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// TODO: Get filename
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if (!counter.containsKey(token)) {
counter.put(token, 1);
} else {
counter.put(token, (Integer) counter.get(token) + 1);
}
}
output.set(mapToString(counter));
context.write(new Text("filename1"), output);
}
}
减少
public static class Reduce extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
int number = 0;
System.out.println(key);
for (Text val : values) { // val line record
String[] input = val.toString().split(";\\s");
for (int i = 0; i < input.length; i++) {
String[] temp = input[i].split("=");
String topValue = temp[0];
topValue = temp[0].replaceAll("[^a-zA-Z0-9]", "");
topValue = topValue.toLowerCase();
if (resultMap.containsKey(topValue)) {
int original = resultMap.get(topValue);
int sum = original + Integer.parseInt(temp[1]);
resultMap.put(topValue, sum);
}
}
}
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
StringBuilder sp = new StringBuilder();
System.out.println("MapSize: " + resultMap);
int i = 0;
Iterator iterator = resultMap.entrySet().iterator();
while (iterator.hasNext()) {
Entry me2 = (Entry) iterator.next();
//System.out.println("key : " + me2.getKey());
sp.append(me2.getKey());
sp.append(":");
sp.append(me2.getValue());
System.out.println(me2.getValue());
sp.append(",");
}
context.write(new Text("file1"), new Text(sp.toString()));
context.write(new Text("file2"), new Text(sp.toString()));
}
}
我卡在两个文件字数组合在一起。我希望在1行中打印每个文件字数。