我正在尝试编写Java代码,而不是读取所有的cr字段(信息重审中的热门话题),以便进行标记化,计算总标记,找到50个频繁的单词并删除预定义的停用词。
除了StopWordsRemoval
方法(代码中的最后一个)之外,它的工作原理,
它不会根据需要更改输出,此方法之前/之后的输出相同!
import java.io.*;
import java.util.*;
public class Information_Retrieval_Hw1 {
//Global variables
public static BufferedReader buffer;
public static Hashtable<String, Integer> wordList = new Hashtable<String, Integer>();
public static ArrayList<Hashtable <String,Integer>> fileMap = new ArrayList<Hashtable<String,Integer>>();
public static Set<String> tagNames = new HashSet<String>();
//public static ArrayList<Map.Entry<String, Integer>> list;
public static int documentsCount = 0;
public static int totalTokens = 0;
public static int uniqueWords = 0;
public static int tagCount = 0;
public static int singleOccureneWords = 0;
public static ArrayList<Map.Entry<String, Integer>> sortedList;
public Information_Retrieval_Hw1() {
// TODO Auto-generated constructor stub
}
public static void main(String[] args) throws IOException {
String cranfield = "/Users/Manal/Desktop/semster1/IR/assigenment 1/cranfieldDocs";
File cranfieldFiles = new File(cranfield);
ReadFile(cranfieldFiles);
System.out.println("Total number of documents: " + fileMap.size());
//Calculate total number of tokens
totalTokens = CalculateNumberOfTokens(wordList);
System.out.println("Total number Of words = " + totalTokens);
//Calculate number of unique words
uniqueWords = CalculateUniqueWords(wordList);
System.out.println("Total number Of distinct words = " + uniqueWords);
//Calculate number of unique words
singleOccureneWords = CalculateSingleOccurenceWords(wordList);
System.out.println("Total number Of words that occur only once = " + singleOccureneWords);
//Find the 30 most frequent words
FindFiftyMostFrequentWords(wordList);
StopWordsRemoval (cranfieldFiles,wordList);
//reprint all information after removing stopword;
System.out.println("\n***********************************\nAfter removing stop words \n***********************************\n");
//Calculate total number of tokens
totalTokens = CalculateNumberOfTokens(wordList);
System.out.println("Total number Of words = " + totalTokens);
//Calculate number of unique words
uniqueWords = CalculateUniqueWords(wordList);
System.out.println("Total number Of distinct words = " + uniqueWords);
//Calculate number of unique words
singleOccureneWords = CalculateSingleOccurenceWords(wordList);
System.out.println("Total number Of words that occur only once = " + singleOccureneWords);
//Find the 30 most frequent words
FindFiftyMostFrequentWords(wordList);
}
public static void ReadFile(File cranfieldFiles) throws IOException{
for (File file: cranfieldFiles.listFiles())
{
//read files recursively if path contains folder
if(file.isDirectory())
{
ReadFile(file);
}
else
{
documentsCount++;
try
{
buffer = new BufferedReader(new FileReader(file));
}
catch (FileNotFoundException e)
{
System.out.println("File not Found");
}
//find the tags and their count
tagCount = tagCount + TagHandler(file, tagNames);
//find words in the cranfield
TokenHandler(file, tagNames);
}
}
}
public static int TagHandler(File file, Set<String> tagNames) throws IOException
{
String line;
int tag_count = 0;
buffer = new BufferedReader(new FileReader(file));
while((line = buffer.readLine()) != null)
{
/*
* If the line contains a '<', it is considered a tag and tag_count is incremented.
*/
if(line.contains("<"))
{
tag_count++;
String b = line.replaceAll("[<*>/]", "");
tagNames.add(b);
}
}
tag_count/=2; //Since each tag represent the beginning and the end, we divide it by two to get the actual count.
return tag_count;
}
public static void TokenHandler(File file, Set<String> tagNames) throws IOException
{
String line;
String words[];
buffer = new BufferedReader(new FileReader(file));
Hashtable<String, Integer> tempMap = new Hashtable<String, Integer>();
while((line = buffer.readLine()) != null)
{
String s1 = line.replaceAll("[^a-zA-Z.]+"," "); //Replace everything that is not an alphabet with a blank space.
String s2 = s1.replaceAll("[.]", "");//Replace words with . (eg U.S) as 1 word
words = s2.split(" ");
for(String word : words)
{
//Handle the tags properly
if(!tagNames.contains(word) && !word.equals(""))
{
word = word.toLowerCase(); // Converts all words to lower case.
//add word if it isn't added already
if(!wordList.containsKey(word))
{
//first occurance of this word
wordList.put(word, 1);
//Following is to compute the unique words in each document
if(!tempMap.containsKey(word))
{
tempMap.put(word,1);
}
else
{
tempMap.put(word, tempMap.get(word)+ 1);
}
}
else
{
//Increament the count of that word
wordList.put(word, wordList.get(word) + 1);
if(!tempMap.containsKey(word))
{
tempMap.put(word,1);
}
else
{
tempMap.put(word, tempMap.get(word)+ 1);
}
}
}
}
}
//Add count to file map and after reading every file
fileMap.add(tempMap);
}
//Function to find the total number of tokens in the cranfield database
public static int CalculateNumberOfTokens(Hashtable<String, Integer> myWordList)
{
int noOfTokens = 0;
for (Integer value: myWordList.values())
{
noOfTokens = noOfTokens + value;
}
return noOfTokens;
}
public static int CalculateUniqueWords(Hashtable<String, Integer> myWordList)
{
return myWordList.size();
}
public static int CalculateSingleOccurenceWords(Hashtable<String, Integer> myWordList)
{
int count = 0;
for (Integer value: myWordList.values())
{
if(value == 1)
{
count++;
}
}
return count;
}
//Sorting the hashTable
public static ArrayList<Map.Entry<String, Integer>> SortHashTable(Hashtable<String, Integer> myWordList)
{
ArrayList<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(myWordList.entrySet());
Collections.sort(list, new Comparator<Map.Entry<String, Integer>>(){
public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
return o2.getValue().compareTo(o1.getValue());
}});
return list;
}
public static void FindFiftyMostFrequentWords(Hashtable<String, Integer> myWordList)
{
//Sort the hashtable based on value
sortedList = SortHashTable(myWordList);
System.out.println("The 50 most frequent words are: ");
for(int i=0;i<50;i++)
{
System.out.println("\t" + (i+1) + "." + " " + sortedList.get(i));
}
}
public static Hashtable<String, Integer> StopWordsRemoval (File file, Hashtable<String, Integer> wordList) throws IOException {
int k=0,j;
String sCurrentLine;
String[] stopwords = new String[2000];
try
{
FileReader fr=new FileReader("/Users/Manal/Desktop/semster1/IR/assigenment 1/xid-10624858_1.txt");
BufferedReader br= new BufferedReader(fr);
while ((sCurrentLine = br.readLine()) != null){
stopwords[k]=sCurrentLine;
k++;
}
Set<String> keys = wordList.keySet();
for(String key: keys)
{
for(j = 0; j < k; j++)
{
if(wordList.keySet().equals(stopwords[j]))
wordList.remove(key);
}
}
}
catch(Exception ex)
{System.out.println(ex);}
return wordList;
}
}
答案 0 :(得分:0)
我认为这是代码中的问题
if(wordList.keySet().equals(stopwords[j]))
你正在做的是检查keySet是否等于单词(keySet()返回Set)与keySet是否包含单词。试试这个:
if(wordList.keySet().contains(stopwords[j]))
如果这样可以解决您的问题,请告诉我。