I wrote a Hadoop program whose input file, hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/input/paths.txt, contains paths to files on the local file system (identical on all machines of the cluster), written on a single line by the program ./readwritepaths and separated by the character |. First, the mapper reads the number of slave nodes of the cluster from the file /usr/countcomputers.txt; judging by the program's execution, it is read correctly. Next, the contents of the input file arrive at the mapper's input as the value, are converted to a string, split on the separator |, and the resulting paths are added to ArrayList<String> paths.
package org.myorg;

import java.io.*;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class ParallelIndexation {
    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, LongWritable> {
        private final static LongWritable zero = new LongWritable(0);
        private Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            int CountComputers;
            // number of slave nodes, read from a local file
            FileInputStream fstream = new FileInputStream("/usr/countcomputers.txt");
            BufferedReader br = new BufferedReader(new InputStreamReader(fstream));
            String result = br.readLine();
            CountComputers = Integer.parseInt(result);
            br.close();
            fstream.close();
            System.out.println("CountComputers=" + CountComputers);
            // split the input line on "|" into individual paths
            ArrayList<String> paths = new ArrayList<String>();
            StringTokenizer tokenizer = new StringTokenizer(line, "|");
            while (tokenizer.hasMoreTokens()) {
                paths.add(tokenizer.nextToken());
            }
Then, as a check, I dump the values of the elements of ArrayList<String> paths to the file /export/hadoop-1.0.1/bin/readpathsfromdatabase.txt; its contents are given below and confirm that ArrayList<String> paths was filled correctly.
            PrintWriter zzz = null;
            try {
                zzz = new PrintWriter(new FileOutputStream(
                        "/export/hadoop-1.0.1/bin/readpathsfromdatabase.txt"));
            } catch (FileNotFoundException e) {
                System.out.println("Error");
                System.exit(0);
            }
            for (int i = 0; i < paths.size(); i++) {
                zzz.println("paths[" + i + "]=" + paths.get(i) + "\n");
            }
            zzz.close();
These paths are then joined with the character \n, and the concatenated results are stored in the array String[] ConcatPaths = new String[CountComputers], one element per slave node.
            String[] ConcatPaths = new String[CountComputers];
            int NumberOfElementConcatPaths = 0;
            if (paths.size() % CountComputers == 0) {
                // the paths divide evenly: each node gets paths.size() / CountComputers of them
                for (int i = 0; i < CountComputers; i++) {
                    ConcatPaths[i] = paths.get(NumberOfElementConcatPaths);
                    NumberOfElementConcatPaths += paths.size() / CountComputers;
                    for (int j = 1; j < paths.size() / CountComputers; j++) {
                        ConcatPaths[i] += "\n"
                                + paths.get(i * paths.size() / CountComputers + j);
                    }
                }
            } else {
                // uneven division: the first paths.size() % CountComputers nodes
                // get one extra path each
                NumberOfElementConcatPaths = 0;
                for (int i = 0; i < paths.size() % CountComputers; i++) {
                    ConcatPaths[i] = paths.get(NumberOfElementConcatPaths);
                    NumberOfElementConcatPaths += paths.size() / CountComputers + 1;
                    for (int j = 1; j < paths.size() / CountComputers + 1; j++) {
                        ConcatPaths[i] += "\n"
                                + paths.get(i * (paths.size() / CountComputers + 1) + j);
                    }
                }
                for (int k = paths.size() % CountComputers; k < CountComputers; k++) {
                    ConcatPaths[k] = paths.get(NumberOfElementConcatPaths);
                    NumberOfElementConcatPaths += paths.size() / CountComputers;
                    for (int j = 1; j < paths.size() / CountComputers; j++) {
                        ConcatPaths[k] += "\n"
                                + paths.get((k - paths.size() % CountComputers)
                                        * paths.size() / CountComputers
                                        + paths.size() % CountComputers
                                        * (paths.size() / CountComputers + 1)
                                        + j);
                    }
                }
            }
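As an illustration of the branch that runs here, the following is a minimal standalone sketch (the class name ConcatPathsExample is made up for this example): with the six paths from paths.txt and CountComputers = 2, paths.size() % CountComputers == 0, so each of the two array cells receives paths.size() / CountComputers = 3 consecutive paths joined with \n.

// Standalone sketch of the even-split branch above; ConcatPathsExample is a made-up name.
import java.util.ArrayList;
import java.util.Arrays;

public class ConcatPathsExample {
    public static void main(String[] args) {
        ArrayList<String> paths = new ArrayList<String>(Arrays.asList(
                "/export/hadoop-1.0.1/bin/error.txt",
                "/root/nexenta_search/nsindexer.conf",
                "/root/nexenta_search/traverser.c",
                "/root/nexenta_search/buf_read.c",
                "/root/nexenta_search/main.c",
                "/root/nexenta_search/avl_tree.c"));
        int CountComputers = 2;
        int perNode = paths.size() / CountComputers; // 6 / 2 = 3 paths per node
        String[] ConcatPaths = new String[CountComputers];
        for (int i = 0; i < CountComputers; i++) {
            ConcatPaths[i] = paths.get(i * perNode);
            for (int j = 1; j < perNode; j++) {
                ConcatPaths[i] += "\n" + paths.get(i * perNode + j);
            }
        }
        // Produces the same two groups that appear in concatpaths.txt further below.
        for (int i = 0; i < ConcatPaths.length; i++) {
            System.out.println("ConcatPaths[" + i + "]=" + ConcatPaths[i] + "\n");
        }
    }
}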
I also dump the cells of the array String[] ConcatPaths to the file /export/hadoop-1.0.1/bin/concatpaths.txt to check that the concatenation is correct. The text of this file, given below, likewise confirms that the previous stages worked correctly.
            PrintWriter zzz1 = null;
            try {
                zzz1 = new PrintWriter(new FileOutputStream(
                        "/export/hadoop-1.0.1/bin/concatpaths.txt"));
            } catch (FileNotFoundException e) {
                System.out.println("Error");
                System.exit(0);
            }
            for (int i = 0; i < ConcatPaths.length; i++) {
                zzz1.println("ConcatPaths[" + i + "]=" + ConcatPaths[i] + "\n");
            }
            zzz1.close();
The mapper then emits the cells of the array String[] ConcatPaths, i.e. the concatenated paths, as its output keys.
            for (int i = 0; i < ConcatPaths.length; i++) {
                word.set(ConcatPaths[i]);
                output.collect(word, zero);
            }
        }
    }
In the reducer, the incoming key is split on the separator \n and the resulting paths are stored in ArrayList<String> ProcessedPaths.
    public static class Reduce extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, LongWritable> {
        public native long Traveser(String Path);

        public native void Configure(String Path);

        public void reduce(Text key, Iterator<IntWritable> value,
                OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            long count = 0;
            String line = key.toString();
            ArrayList<String> ProcessedPaths = new ArrayList<String>();
            StringTokenizer tokenizer = new StringTokenizer(line, "\n");
            while (tokenizer.hasMoreTokens()) {
                ProcessedPaths.add(tokenizer.nextToken());
            }
To check the splitting, I dump the elements of ArrayList<String> ProcessedPaths, obtained from the concatenated key, to the file /export/hadoop-1.0.1/bin/ProcessedPaths.txt. The contents of this file turned out to be identical on both slave nodes and correspond to the paths split off from the second concatenated key, even though the mapper output contained two different concatenated keys. Most surprising of all: the subsequent lines of the reducer index the received paths, i.e. enter words from those files into a database table, and yet only one file, /export/hadoop-1.0.1/bin/error.txt, which belongs to the first concatenated key, was indexed.
            PrintWriter zzz2 = null;
            try {
                zzz2 = new PrintWriter(new FileOutputStream(
                        "/export/hadoop-1.0.1/bin/ProcessedPaths.txt"));
            } catch (FileNotFoundException e) {
                System.out.println("Error");
                System.exit(0);
            }
            for (int i = 0; i < ProcessedPaths.size(); i++) {
                zzz2.println("ProcessedPaths[" + i + "]=" + ProcessedPaths.get(i) + "\n");
            }
            zzz2.close();
            Configure("/etc/nsindexer.conf");
            for (int i = 0; i < ProcessedPaths.size(); i++) {
                count = Traveser(ProcessedPaths.get(i));
            }
            output.collect(key, new LongWritable(count));
        }
    }
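The driver (main method) of the job, and the place where the native library providing Traveser and Configure is loaded with System.loadLibrary, are not shown above. Purely for orientation, a minimal driver for the old org.apache.hadoop.mapred API used here might look roughly like the sketch below; this is an assumption about code that is not in the post, not the actual driver.

    // Hypothetical driver sketch (not the actual code of the post).
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(ParallelIndexation.class);
        conf.setJobName("parallelindexation");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(LongWritable.class);
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        // These are only requests; unless the driver also parses the -D options
        // passed on the command line (e.g. via ToolRunner), those options have no effect.
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(2);
        JobClient.runJob(conf);
    }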
The program is run with the help of the following bash script:
#!/bin/bash
cd /export/hadoop-1.0.1/bin
./hadoop namenode -format
./start-all.sh
./hadoop fs -rmr hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/output
./hadoop fs -rmr hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/input
./hadoop fs -mkdir hdfs://192.168.1.8:7000/export/hadoop-1.0.1/input
./readwritepaths
sleep 120
./hadoop fs -put /export/hadoop-1.0.1/bin/input/paths.txt hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/input/paths.txt 1> copyinhdfs.txt 2>&1
./hadoop jar /export/hadoop-1.0.1/bin/ParallelIndexation.jar org.myorg.ParallelIndexation /export/hadoop-1.0.1/bin/input /export/hadoop-1.0.1/bin/output -D mapred.map.tasks=1 -D mapred.reduce.tasks=2 1> resultofexecute.txt 2>&1
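A note on the last command of the script: the -D mapred.map.tasks=1 -D mapred.reduce.tasks=2 options are only picked up automatically if the driver goes through ToolRunner / GenericOptionsParser, and in that case the generic options have to come before the positional input and output paths, roughly like this (an untested reordering of the same command, shown only for illustration):

./hadoop jar /export/hadoop-1.0.1/bin/ParallelIndexation.jar org.myorg.ParallelIndexation -D mapred.map.tasks=1 -D mapred.reduce.tasks=2 /export/hadoop-1.0.1/bin/input /export/hadoop-1.0.1/bin/output 1> resultofexecute.txt 2>&1

Even then, mapred.map.tasks is only a hint to the framework; the actual number of map tasks is determined by the input splits.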
Judging by the last command, there should be only one mapper. Nevertheless, the files /export/hadoop-1.0.1/bin/readpathsfromdatabase.txt and /export/hadoop-1.0.1/bin/concatpaths.txt appeared on both slave nodes. Below are the contents of the files mentioned above:
hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/input/paths.txt
/export/hadoop-1.0.1/bin/error.txt|/root/nexenta_search/nsindexer.conf|/root/nexenta_search/traverser.c|/root/nexenta_search/buf_read.c|/root/nexenta_search/main.c|/root/nexenta_search/avl_tree.c|
/export/hadoop-1.0.1/bin/readpathsfromdatabase.txt
paths[0]=/export/hadoop-1.0.1/bin/error.txt
paths[1]=/root/nexenta_search/nsindexer.conf
paths[2]=/root/nexenta_search/traverser.c
paths[3]=/root/nexenta_search/buf_read.c
paths[4]=/root/nexenta_search/main.c
paths[5]=/root/nexenta_search/avl_tree.c
/export/hadoop-1.0.1/bin/concatpaths.txt
ConcatPaths[0]=/export/hadoop-1.0.1/bin/error.txt
/root/nexenta_search/nsindexer.conf
/root/nexenta_search/traverser.c
ConcatPaths[1]=/root/nexenta_search/buf_read.c
/root/nexenta_search/main.c
/root/nexenta_search/avl_tree.c
/export/hadoop-1.0.1/bin/ProcessedPaths.txt
ProcessedPaths[0]=/root/nexenta_search/buf_read.c
ProcessedPaths[1]=/root/nexenta_search/main.c
ProcessedPaths[2]=/root/nexenta_search/avl_tree.c
In connection with all this, I want to ask 3 questions:

1. Why did the files /export/hadoop-1.0.1/bin/readpathsfromdatabase.txt and /export/hadoop-1.0.1/bin/concatpaths.txt appear on both slave nodes, if there was supposed to be only one mapper?
2. Why is the text of the /export/hadoop-1.0.1/bin/ProcessedPaths.txt files identical on both slave nodes, as given above?
3. Why was only one file, /export/hadoop-1.0.1/bin/error.txt, which belongs to the first concatenated key, indexed?