Loss of the key-value pairs output by the mapper

Time: 2013-05-02 11:51:14

Tags: java linux hadoop mapreduce

I wrote a Hadoop program whose mapper input file, hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/input/paths.txt, is produced from the local file system (which is identical on all machines of the cluster) by the program ./readwritepaths as a single line whose fields are separated by the character |. First, the mapper reads the number of slave nodes of the cluster from the file /usr/countcomputers.txt; judging by the program's execution, this value is also read correctly. Then the contents of the input file arrive at the mapper as the value, are converted to a string, split on the separator |, and the resulting paths are added to ArrayList<String> paths.

package org.myorg;

import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class ParallelIndexation {
    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, LongWritable> {
        private final static LongWritable zero = new LongWritable(0);
        private Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            int CountComputers;
            FileInputStream fstream = new FileInputStream(
                    "/usr/countcomputers.txt");

            BufferedReader br = new BufferedReader(new InputStreamReader(fstream));
            String result=br.readLine();
            CountComputers=Integer.parseInt(result);
            br.close();
            fstream.close();
            System.out.println("CountComputers="+CountComputers);
            ArrayList<String> paths = new ArrayList<String>();
            StringTokenizer tokenizer = new StringTokenizer(line, "|");
            while (tokenizer.hasMoreTokens()) {
                paths.add(tokenizer.nextToken());
            }

Then, as a check, I wrote the values of the elements of ArrayList<String> paths to the file /export/hadoop-1.0.1/bin/readpathsfromdatabase.txt; its contents are given below and confirm that ArrayList<String> paths was filled correctly.

            PrintWriter zzz = null;
            try
            {
                zzz = new PrintWriter(new FileOutputStream("/export/hadoop-1.0.1/bin/readpathsfromdatabase.txt"));
            }
            catch (FileNotFoundException e)
            {
                System.out.println("Error");
                System.exit(0);
            }
            for (int i = 0; i < paths.size(); i++)
            {
                zzz.println("paths[" + i + "]=" + paths.get(i) + "\n");
            }
            zzz.close();

These paths are then joined with the character \n, and the joined results are stored in the array String[] ConcatPaths = new String[CountComputers].

        String[] ConcatPaths = new String[CountComputers];
        int NumberOfElementConcatPaths = 0;
        if (paths.size() % CountComputers == 0) {
            for (int i = 0; i < CountComputers; i++) {
                ConcatPaths[i] = paths.get(NumberOfElementConcatPaths);
                NumberOfElementConcatPaths += paths.size() / CountComputers;
                for (int j = 1; j < paths.size() / CountComputers; j++) {
                    ConcatPaths[i] += "\n"
                            + paths.get(i * paths.size() / CountComputers
                                    + j);
                }
            }
        } else {
            NumberOfElementConcatPaths = 0;
            for (int i = 0; i < paths.size() % CountComputers; i++) {
                ConcatPaths[i] = paths.get(NumberOfElementConcatPaths);
                NumberOfElementConcatPaths += paths.size() / CountComputers
                        + 1;
                for (int j = 1; j < paths.size() / CountComputers + 1; j++) {
                    ConcatPaths[i] += "\n"
                            + paths.get(i
                                    * (paths.size() / CountComputers + 1)
                                    + j);
                }
            }
            for (int k = paths.size() % CountComputers; k < CountComputers; k++) {
                ConcatPaths[k] = paths.get(NumberOfElementConcatPaths);
                NumberOfElementConcatPaths += paths.size() / CountComputers;
                for (int j = 1; j < paths.size() / CountComputers; j++) {
                    ConcatPaths[k] += "\n"
                            + paths.get((k - paths.size() % CountComputers)
                                    * paths.size() / CountComputers
                                    + paths.size() % CountComputers
                                    * (paths.size() / CountComputers + 1)
                                    + j);
                }
            }
        }
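
To make the distribution arithmetic concrete: with the six sample paths quoted at the end of the post and CountComputers = 2, the even branch above puts 6 / 2 = 3 consecutive paths into each concatenated string, which is exactly what concatpaths.txt below shows. A minimal standalone sketch of just that case (the class name and the hard-coded values are only for illustration):

import java.util.Arrays;
import java.util.List;

public class GroupingSketch {
    public static void main(String[] args) {
        // the six paths from the sample paths.txt, in their original order
        List<String> paths = Arrays.asList(
                "/export/hadoop-1.0.1/bin/error.txt",
                "/root/nexenta_search/nsindexer.conf",
                "/root/nexenta_search/traverser.c",
                "/root/nexenta_search/buf_read.c",
                "/root/nexenta_search/main.c",
                "/root/nexenta_search/avl_tree.c");
        int countComputers = 2;                       // assumed slave-node count
        int perGroup = paths.size() / countComputers; // 6 / 2 = 3 paths per group
        for (int i = 0; i < countComputers; i++) {
            // join perGroup consecutive paths with '\n', as the even branch above does
            StringBuilder group = new StringBuilder(paths.get(i * perGroup));
            for (int j = 1; j < perGroup; j++) {
                group.append("\n").append(paths.get(i * perGroup + j));
            }
            System.out.println("ConcatPaths[" + i + "]=" + group);
        }
    }
}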

I also wrote the cells of the array String[] ConcatPaths to the file /export/hadoop-1.0.1/bin/concatpaths.txt to check the correctness of the concatenation. The text of this file, given below, likewise confirms the correctness of the preceding stages.

        PrintWriter zzz1 = null;
        try
        {
            zzz1 = new PrintWriter(new FileOutputStream("/export/hadoop-1.0.1/bin/concatpaths.txt"));
        }
        catch (FileNotFoundException e)
        {
            System.out.println("Error");
            System.exit(0);
        }
        for (int i = 0; i < ConcatPaths.length; i++)
        {
            zzz1.println("ConcatPaths[" + i + "]=" + ConcatPaths[i] + "\n");
        }
        zzz1.close();

The mapper then emits the cells of the array String[] ConcatPaths, i.e. the concatenated paths, as its output keys.

        for (int i = 0; i < ConcatPaths.length; i++) 
        {
            word.set(ConcatPaths[i]);
            output.collect(word, zero);
        }

In the reducer, the received key is split on the separator \n and the resulting paths are added to ArrayList<String> ProcessedPaths.

public static class Reduce extends MapReduceBase implements
        Reducer<Text, IntWritable, Text, LongWritable> {
    public native long Traveser(String Path);

    public native void Configure(String Path);

    public void reduce(Text key, Iterator<IntWritable> value,
            OutputCollector<Text, LongWritable> output, Reporter reporter)
            throws IOException {
        long count=0;
        String line = key.toString();
        ArrayList<String> ProcessedPaths = new ArrayList<String>();
        StringTokenizer tokenizer = new StringTokenizer(line, "\n");
        while (tokenizer.hasMoreTokens()) {
            ProcessedPaths.add(tokenizer.nextToken());
        }

To check how the key was split, I wrote the elements of ArrayList<String> ProcessedPaths to the file /export/hadoop-1.0.1/bin/ProcessedPaths.txt. The contents of this file turned out to be the same on both slave nodes and correspond to the paths of the second concatenated key, even though two different concatenated keys were emitted at the mapper output. Most surprising of all: the subsequent lines of the reducer index the received files (that is, they insert the words from these files into a database table), yet only one file, /export/hadoop-1.0.1/bin/error.txt, which belongs to the first concatenated key, was indexed.

        PrintWriter zzz2 = null;
        try
        {
            zzz2 = new PrintWriter(new FileOutputStream("/export/hadoop-1.0.1/bin/ProcessedPaths.txt"));
        }
        catch (FileNotFoundException e)
        {
            System.out.println("Error");
            System.exit(0);
        }
        for (int i = 0; i < ProcessedPaths.size(); i++)
        {
            zzz2.println("ProcessedPaths[" + i + "]=" + ProcessedPaths.get(i) + "\n");
        }
        zzz2.close();
        Configure("/etc/nsindexer.conf");
        for (int i = 0; i < ProcessedPaths.size(); i++) {
            count = Traveser(ProcessedPaths.get(i));
        }
        output.collect(key, new LongWritable(count));
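
The Reduce class declares the native methods Traveser and Configure, but the post does not show where the corresponding JNI library is loaded. Normally that is done in a static initializer of the class, roughly as follows (the library name here is only a guess suggested by the /etc/nsindexer.conf path):

    static {
        // assumed: load the JNI library that implements Traveser() and Configure();
        // the real library name is not given in the post
        System.loadLibrary("nsindexer");
    }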

The program is run with the help of the following bash script:

#!/bin/bash
cd /export/hadoop-1.0.1/bin
./hadoop namenode -format
./start-all.sh
./hadoop fs -rmr hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/output
./hadoop fs -rmr hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/input
./hadoop fs -mkdir hdfs://192.168.1.8:7000/export/hadoop-1.0.1/input
./readwritepaths
sleep 120
./hadoop fs -put /export/hadoop-1.0.1/bin/input/paths.txt hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/input/paths.txt 1> copyinhdfs.txt 2>&1
./hadoop jar /export/hadoop-1.0.1/bin/ParallelIndexation.jar org.myorg.ParallelIndexation /export/hadoop-1.0.1/bin/input /export/hadoop-1.0.1/bin/output -D mapred.map.tasks=1 -D mapred.reduce.tasks=2 1> resultofexecute.txt 2>&1
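
The driver class that actually submits the job is not shown in the post. A minimal sketch of how it is typically written with the old org.apache.hadoop.mapred API used above (the class names and argument order are taken from the post; everything else is an assumption):

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(ParallelIndexation.class);
        conf.setJobName("parallelindexation");

        // key/value types produced by the map and reduce phases
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(LongWritable.class);

        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // args[0] and args[1] are the input and output paths passed by the script
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }

Note that with a plain main like this, the -D options at the end of the last command arrive as ordinary program arguments; they only take effect as configuration when the driver goes through ToolRunner/GenericOptionsParser and the -D options are placed before the positional arguments.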

According to the last command, there should be only one mapper. Nevertheless, the files /export/hadoop-1.0.1/bin/readpathsfromdatabase.txt and /export/hadoop-1.0.1/bin/concatpaths.txt appeared on both slave nodes. Below I give the contents of the files mentioned above.

hdfs://192.168.1.8:7000/export/hadoop-1.0.1/bin/input/paths.txt

/export/hadoop-1.0.1/bin/error.txt|/root/nexenta_search/nsindexer.conf|/root/nexenta_search/traverser.c|/root/nexenta_search/buf_read.c|/root/nexenta_search/main.c|/root/nexenta_search/avl_tree.c|

/export/hadoop-1.0.1/bin/readpathsfromdatabase.txt

paths[0]=/export/hadoop-1.0.1/bin/error.txt

paths[1]=/root/nexenta_search/nsindexer.conf

paths[2]=/root/nexenta_search/traverser.c

paths[3]=/root/nexenta_search/buf_read.c

paths[4]=/root/nexenta_search/main.c

paths[5]=/root/nexenta_search/avl_tree.c

/export/hadoop-1.0.1/bin/concatpaths.txt

ConcatPaths[0]=/export/hadoop-1.0.1/bin/error.txt
/root/nexenta_search/nsindexer.conf
/root/nexenta_search/traverser.c

ConcatPaths[1]=/root/nexenta_search/buf_read.c
/root/nexenta_search/main.c
/root/nexenta_search/avl_tree.c

/export/hadoop-1.0.1/bin/ProcessedPaths.txt

ProcessedPaths[0]=/root/nexenta_search/buf_read.c

ProcessedPaths[1]=/root/nexenta_search/main.c

ProcessedPaths[2]=/root/nexenta_search/avl_tree.c

Regarding all this, I would like to ask three questions:

  1. Why is the text of the file /export/hadoop-1.0.1/bin/ProcessedPaths.txt the same on both nodes, as given here?
  2. Why was only one file, /export/hadoop-1.0.1/bin/error.txt, indexed?
  3. Why did the mapper run on both slave nodes?
