java中的停用词删除方法无法正常工作

时间:2016-09-08 11:28:54

标签: java information-retrieval stop-words

我正在尝试编写Java代码,而不是读取所有的cr字段(信息重审中的热门话题),以便进行标记化,计算总标记,找到50个频繁的单词并删除预定义的停用词。 除了StopWordsRemoval方法(代码中的最后一个)之外,它的工作原理, 它不会根据需要更改输出,此方法之前/之后的输出相同!

你可以帮我搞清楚问题是什么吗? 这是我在Java中的第一个代码:(

import java.io.*;
import java.util.*;

public class Information_Retrieval_Hw1 {

    //Global variables
    public static BufferedReader buffer;
    public static Hashtable<String, Integer> wordList = new Hashtable<String, Integer>();
    public static ArrayList<Hashtable <String,Integer>> fileMap = new ArrayList<Hashtable<String,Integer>>();
    public static Set<String> tagNames = new HashSet<String>();
    //public static ArrayList<Map.Entry<String, Integer>> list;

    public static int documentsCount = 0;
    public static int totalTokens = 0;
    public static int uniqueWords = 0;
    public static int tagCount = 0;
    public static int singleOccureneWords = 0;

    public static ArrayList<Map.Entry<String, Integer>> sortedList;


    public Information_Retrieval_Hw1() {
        // TODO Auto-generated constructor stub
    }

    public static void main(String[] args) throws IOException {

        String cranfield = "/Users/Manal/Desktop/semster1/IR/assigenment 1/cranfieldDocs";
        File cranfieldFiles = new File(cranfield);
        ReadFile(cranfieldFiles);

        System.out.println("Total number of documents: " + fileMap.size());

        //Calculate total number of tokens
        totalTokens = CalculateNumberOfTokens(wordList);
        System.out.println("Total number Of words = " + totalTokens);

        //Calculate number of unique words
        uniqueWords = CalculateUniqueWords(wordList);
        System.out.println("Total number Of distinct words = " + uniqueWords);

        //Calculate number of unique words
        singleOccureneWords = CalculateSingleOccurenceWords(wordList);
        System.out.println("Total number Of words that occur only once = " + singleOccureneWords);

        //Find the 30 most frequent words
        FindFiftyMostFrequentWords(wordList);

        StopWordsRemoval (cranfieldFiles,wordList);
        //reprint all information after removing stopword;

        System.out.println("\n***********************************\nAfter removing stop words \n***********************************\n");

        //Calculate total number of tokens
        totalTokens = CalculateNumberOfTokens(wordList);
        System.out.println("Total number Of words = " + totalTokens);

        //Calculate number of unique words
        uniqueWords = CalculateUniqueWords(wordList);
        System.out.println("Total number Of distinct words = " + uniqueWords);

        //Calculate number of unique words
        singleOccureneWords = CalculateSingleOccurenceWords(wordList);
        System.out.println("Total number Of words that occur only once = " + singleOccureneWords);

        //Find the 30 most frequent words
        FindFiftyMostFrequentWords(wordList);

    }

    public static void ReadFile(File cranfieldFiles) throws IOException{
        for (File file: cranfieldFiles.listFiles())
        {
            //read files recursively if path contains folder
            if(file.isDirectory())
            {
                ReadFile(file);
            }

            else
            {
                documentsCount++;
                try
                {
                    buffer = new BufferedReader(new FileReader(file));
                }
                catch (FileNotFoundException e)
                {
                    System.out.println("File not Found");

                }
                //find the tags and their count
                tagCount = tagCount + TagHandler(file, tagNames);
                //find words in the cranfield
                TokenHandler(file, tagNames);

            }
        }


    }

    public static int TagHandler(File file, Set<String> tagNames) throws IOException
    {
        String line;
        int tag_count = 0;


        buffer = new BufferedReader(new FileReader(file));
        while((line = buffer.readLine()) != null)
        {
        /*
         * If the line contains a '<', it is considered a tag and tag_count is incremented.
         */
            if(line.contains("<"))
            {
                tag_count++;

                String b = line.replaceAll("[<*>/]", "");
                tagNames.add(b);
            }

        }
        tag_count/=2; //Since each tag represent the beginning and the end, we divide it by two to get the actual count.
        return tag_count;
    }

    public static void TokenHandler(File file, Set<String> tagNames) throws IOException
    {
        String line;
        String words[];

        buffer = new BufferedReader(new FileReader(file));
        Hashtable<String, Integer> tempMap = new Hashtable<String, Integer>();

        while((line = buffer.readLine()) != null)
        {

            String s1 = line.replaceAll("[^a-zA-Z.]+"," "); //Replace everything that is not an alphabet with a blank space.
            String s2 = s1.replaceAll("[.]", "");//Replace words with . (eg U.S) as 1 word
            words = s2.split(" ");

            for(String word : words)
            {
                //Handle the tags properly
                if(!tagNames.contains(word) && !word.equals(""))
                {
                    word = word.toLowerCase(); // Converts all words to lower case.

                    //add word if it isn't added already
                    if(!wordList.containsKey(word))
                    {
                        //first occurance of this word
                        wordList.put(word, 1);

                        //Following is to compute the unique words in each document
                        if(!tempMap.containsKey(word))
                        {
                            tempMap.put(word,1);

                        }
                        else
                        {
                            tempMap.put(word, tempMap.get(word)+ 1);

                        }
                    }
                    else
                    {
                        //Increament the count of that word
                        wordList.put(word, wordList.get(word) + 1);
                        if(!tempMap.containsKey(word))
                        {
                            tempMap.put(word,1);

                        }
                        else
                        {
                            tempMap.put(word, tempMap.get(word)+ 1);
                        }
                    }
                }
            }
        }

        //Add count to file map and  after reading every file
        fileMap.add(tempMap);
    }

    //Function to find the total number of tokens in the cranfield database

    public static int CalculateNumberOfTokens(Hashtable<String, Integer> myWordList)
    {
        int noOfTokens = 0;

        for (Integer value: myWordList.values())
        {
            noOfTokens = noOfTokens + value;
        }
        return noOfTokens;
    }

    public static int CalculateUniqueWords(Hashtable<String, Integer> myWordList)
    {

        return myWordList.size();
    }

    public static int CalculateSingleOccurenceWords(Hashtable<String, Integer> myWordList)
    {
        int count = 0;

        for (Integer value: myWordList.values())
        {
            if(value == 1)
            {
                count++;
            }
        }
        return count;
    }

    //Sorting the hashTable
    public static ArrayList<Map.Entry<String, Integer>> SortHashTable(Hashtable<String, Integer> myWordList)
    {
        ArrayList<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(myWordList.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>(){
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }});
        return list;
    }

    public static void FindFiftyMostFrequentWords(Hashtable<String, Integer> myWordList)
    {
        //Sort the hashtable based on value

        sortedList = SortHashTable(myWordList);
        System.out.println("The 50 most frequent words are: ");
        for(int i=0;i<50;i++)
        {
            System.out.println("\t" + (i+1) + "." + " " + sortedList.get(i));
        }
    }

    public static Hashtable<String, Integer> StopWordsRemoval (File file, Hashtable<String, Integer> wordList) throws IOException {
        int k=0,j;
        String sCurrentLine;
        String[] stopwords = new String[2000];
        try
        {
            FileReader fr=new FileReader("/Users/Manal/Desktop/semster1/IR/assigenment 1/xid-10624858_1.txt");
            BufferedReader br= new BufferedReader(fr);
            while ((sCurrentLine = br.readLine()) != null){
                stopwords[k]=sCurrentLine;
                k++;
            }
            Set<String> keys = wordList.keySet();
            for(String key: keys)
            {
                for(j = 0; j < k; j++)
                {
                    if(wordList.keySet().equals(stopwords[j]))
                        wordList.remove(key);
                }
            }
        }
        catch(Exception ex)
        {System.out.println(ex);}

        return wordList;
    }
}

1 个答案:

答案 0 :(得分:0)

我认为这是代码中的问题

if(wordList.keySet().equals(stopwords[j]))

你正在做的是检查keySet是否等于单词(keySet()返回Set)与keySet是否包含单词。试试这个:

if(wordList.keySet().contains(stopwords[j]))

如果这样可以解决您的问题,请告诉我。