Hadoop,倒排索引,复合

时间:2019-03-04 21:43:17

标签: java hadoop

我正在尝试为以下问题实现倒排索引:给我的目录中有X数量的文件,我需要产生以下格式的输出:Term,file0:count; file1:计数

我使用词对方法来实现,该方法使用(term,file)作为关键字,将其计为值。

我的词对课:

public class wordpair implements Writable,WritableComparable<wordpair> {
//Compoiste key apply in this method
//stucture of compostie key (word,fileName)
private Text word;
private Text fileName;
private String space = " ";

public wordpair(Text word,Text fileName) {
    this.word = word;
    this.fileName = fileName;
}

public wordpair(String word, String fileName) {
    this(new Text(word), new Text(fileName));
}

public wordpair() {
    this.word = new Text();
    this.fileName = new Text();
}

public void setwordpair(Text word, Text fileName){
    this.word = word;
    this.fileName = fileName;
}

@Override
public int compareTo(wordpair other) {                         // A compareTo B
    int returnVal = this.word.compareTo(other.getWord());      // return -1: A < B
    return returnVal;
}

public static wordpair read(DataInput in) throws IOException {
    wordpair wordpair = new wordpair();
    wordpair.readFields(in);
    return wordpair;
}

@Override
public void write(DataOutput out) throws IOException {
    word.write(out);
    fileName.write(out);
}

@Override
public void readFields(DataInput in) throws IOException {
    word.readFields(in);
    fileName.readFields(in);
}

@Override
public String toString() {
    return ""+word+""  +""+space+""+""+fileName+"";
}

@Override
public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;

    wordpair wordpair = (wordpair) o;

    if (word != null ? !word.equals(wordpair.word) : wordpair.word != null) return false;

    return true;
}

@Override
public int hashCode() {
    int result = (word != null) ? word.hashCode() : 0;
    //result = 163 * result + ((neighbor != null) ? neighbor.hashCode() : 0);
    return result % 3;
}

public void setWord(String word){
    this.word.set(word);
}

public Text getWord() {
    return word;
}

public void setFileName(String fileName){
    this.fileName.set(fileName);
}

public Text getfileName(){
    return fileName;
}

}

映射器和减速器

public class invertedindex {

        public static class InvertedMapper extends Mapper<LongWritable, Text, wordpair, IntWritable> {
        private wordpair wordpair = new wordpair();
        private Text word = new Text();
        private Text fileName = new Text();
        private IntWritable ONE = new IntWritable(1);
        private IntWritable totalCount = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //Get the file name using context.getInput spilt method
            String name = ((FileSplit)context.getInputSplit()).getPath().getName();

            //save each token to wordpair
            String[] tokens = value.toString().split("\\s+");          // split the words using spaces
            if (tokens.length > 1) {
                for (int i = 0; i < tokens.length; i++) {
                        tokens[i] = tokens[i].replaceAll("\\W+","");   // remove all non-word characters

                        if(tokens[i].equals("")){
                            continue;
                        }

                        word.set(tokens[i]);
                        fileName.set(name);

                        //create compostie key with (word filename)
                        wordpair.setwordpair(word,fileName);

                        //emit key value pair
                        context.write(wordpair,ONE);
                }
            }
        }
    }

    public static class InvertedReducer extends Reducer<wordpair, IntWritable, Text, Text> {
        private IntWritable totalCount = new IntWritable();

        @Override
        protected void reduce(wordpair key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            //initialize hash mp
            Map<String,Integer> map = new HashMap<String,Integer>();
            String fileName = key.getfileName().toString(); //get fileName
            String word = key.getWord().toString();         //get individual term
            int count = 0;
            for (IntWritable value : values) {
                if(map !=null & map.get(fileName) != null){
                    count = map.get(fileName);
                    map.put(fileName,++count);
                }else{
                    map.put(fileName,1);
                }
                 }
            //totalCount.set(count);
            context.write(key.getWord(),new Text(map.toString()));

        }
    }

public static void main(String[] args) throws IOException,InterruptedException,ClassNotFoundException {

    Job job = Job.getInstance(new Configuration());
    job.setJarByClass(invertedindex.class);
    job.setJobName("invertedindex");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(InvertedMapper.class);
    job.setReducerClass(InvertedReducer.class);
    //job.setCombinerClass(PairsReducer.class);
    //job.setPartitionerClass(WordPairPartitioner.class);
    job.setNumReduceTasks(3);

    job.setOutputKeyClass(wordpair.class);
    job.setOutputValueClass(IntWritable.class);
    System.exit(job.waitForCompletion(true) ? 0 : 1);

    }
}

在reducer中,我试图构建一个包含fileName和(term)count的哈希图。但是,它总是以错误的答案结尾。例如,文件0的行a b c d和文件1 a b c c。答案是(file0 = 2),这不是我期望的(file0 = 1,file1 = 1)。

0 个答案:

没有答案