I want to ask a serious question. I have a 500 MB file of "sentences". Because it takes very long to read, I created a "hash" of it and saved it to another file: first I collected the list of words that will appear in my program, then I generated a code for each word and put them in a HashMap, with the word as the key and its code as the value. Using this HashMap I converted the entire 500 MB file into a separate encoded file, which is now 77 MB. Every word gets a unique, fixed 3-character code (with the 46 symbols used below, that allows up to 46^3 = 97,336 distinct words), and one line in the encoded file represents one sentence in the real file.
Now I enter a list of words into the program. The program converts these words into codes as well, then walks through the 77 MB file I just described and checks whether the words I entered appear in it (I am comparing the codes). If a word appears, I take the code that indicates the word and convert it back into the real word. Here is my code.
import java.io.IOException;
import java.io.PrintWriter;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import javax.servlet.RequestDispatcher;
/**
*
* @author XXX
*/
public class Analizer extends HttpServlet {
private static final String[] symbols = { "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f", "g", "h",
"i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "!", "@", "#", "$", "%", "^", "&",
"*", "~", "?" };
    //Maps each word to its 3-character code. Per-request state (request,
    //response, writer) is kept local in doPost: servlets are shared across threads.
    private Map<String, String> wordMap;
public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException
{
doPost(request,response);
}
public void doPost(HttpServletRequest request, HttpServletResponse response)throws ServletException, IOException
{
String[] listOfWords = request.getParameter("wordList").toLowerCase().trim().split("\n"); //Get the List of words
int percentage = Integer.parseInt(request.getParameter("percentage")); // Get the percentage value
double numberOfWordsInProgramHash = 0; //Keep track of how many words in "program" per webpage
int primaryKey = 0; //Store the primary key
StringBuffer hashIndex = new StringBuffer();
LinkedList<DataHolder> storeDataHolders = new LinkedList<DataHolder>();
StringBuilder userListWithoutDuplicates = new StringBuilder();
        PrintWriter pw = response.getWriter();
double numberOfKnownWords = 0;
Arrays.sort(listOfWords);
//Remove the duplicated words in user's list
HashSet<String> userDefinedSet = new HashSet<String>();
for(int i=0;i<listOfWords.length;i++)
{
if (!userDefinedSet.contains(listOfWords[i].trim()))
{
userListWithoutDuplicates.append(listOfWords[i].trim());
userListWithoutDuplicates.append(" ");
userDefinedSet.add(listOfWords[i].trim());
//pw.println(listOfWords[i].trim());
}
}
hashIndex = createHashForUserList(userListWithoutDuplicates);
//Read the Hash File
String str = "";
File inputFile = new File("C:/Users/Yohan/Desktop/Test.txt");
BufferedReader br = new BufferedReader(new FileReader(inputFile));
int pageNumber=0;
while((str=br.readLine())!=null)
{
HashSet<String>hashSet = new HashSet<String>();
ArrayList<String>matchingWordsHolder = new ArrayList<String>();
ArrayList<String>unmatchingWordsHolder = new ArrayList<String>();
            //Each code is exactly 3 characters, so cut the line into 3-character
            //pieces; Math.min() guards a possible shorter final piece without
            //using exceptions for flow control or adding a spurious empty string
            for(int i = 0; i < str.length(); i = i + 3)
            {
                String stringPiece = str.substring(i, Math.min(i + 3, str.length()));
                hashSet.add(stringPiece);
            }
numberOfWordsInProgramHash = hashSet.size();
//pw.println("HASH sets size: "+numberOfWordsInProgramHash);
            //Codes for the user input (note: this re-splits the same user input
            //once per line of the 77 MB file; it could be computed once before the loop)
            String[] finalUserDefinedWordCollection = hashIndex.toString().trim().split(" ");
            //Check how many of the user's codes exist in this line
for(int i=0;i<finalUserDefinedWordCollection.length;i++)
{
if(hashSet.contains(finalUserDefinedWordCollection[i]))
{
matchingWordsHolder.add(finalUserDefinedWordCollection[i]);
//pw.println(finalUserDefinedWordCollection[i]);
hashSet.remove(finalUserDefinedWordCollection[i]);
numberOfKnownWords++;
}
}
            //Collect the codes that did not match
            unmatchingWordsHolder.addAll(hashSet);
double matchingPercentage = ((numberOfKnownWords/numberOfWordsInProgramHash)*100.0);
//pw.println("Page No: "+pageNumber+" Number Of Matches: "+numberOfKnownWords+" Matching Percentage: "+String.valueOf(matchingPercentage));
//pw.println();
if(matchingPercentage>percentage)
{
DataHolder data = new DataHolder(); //This is a custom class
data.setOriginalHash(str);
data.setPrimaryKey(pageNumber);
StringBuffer matchingWordsStr = new StringBuffer("");
StringBuffer unMatchingWordsStr = new StringBuffer("");
                //Reverse-lookup each matching code back to its word (this scans
                //the whole wordMap once per code, per page)
                for(int m = 0; m < matchingWordsHolder.size(); m++)
                {
                    for(Map.Entry<String, String> mEntry : wordMap.entrySet())
                    {
                        if(mEntry.getValue().equals(matchingWordsHolder.get(m)))
                        {
                            matchingWordsStr.append(mEntry.getKey());
                            matchingWordsStr.append(",");
                        }
                    }
                }
data.setMatchingWords(matchingWordsStr);
                //Same reverse lookup for the codes that did not match
                for(int u = 0; u < unmatchingWordsHolder.size(); u++)
                {
                    for(Map.Entry<String, String> mEntry : wordMap.entrySet())
                    {
                        if(mEntry.getValue().equals(unmatchingWordsHolder.get(u)))
                        {
                            unMatchingWordsStr.append(mEntry.getKey());
                            unMatchingWordsStr.append(",");
                        }
                    }
                }
data.setUnmatchingWords(unMatchingWordsStr);
storeDataHolders.add(data);
//pw.write("Record Added to DataHolder");
}
numberOfKnownWords = 0;
primaryKey++;
pageNumber++;
        }
        br.close();
        //Grab the first 10 items from the LinkedList
List<DataHolder> firstTenItems = new ArrayList<DataHolder>();
for(int i=0;i<storeDataHolders.size();i++)
{
firstTenItems.add(storeDataHolders.get(i));
if(i==9)
{
break;
}
}
        //Hand the decoded results to the JSP
request.setAttribute("list", firstTenItems);
RequestDispatcher dispatch = request.getRequestDispatcher("index.jsp");
dispatch.forward(request, response);
}
    /*
     * This method is responsible for creating the code list for the entire list
     * of words we have, and for creating the codes for the user-designed word list
     * */
private StringBuffer createHashForUserList(StringBuilder userListWithoutDuplicates)
{
System.out.println("Calling createHashForUserList()");
createWordNumberingMap();
String[]finalWordHolder = userListWithoutDuplicates.toString().split(" ");
StringBuffer hashIndex = new StringBuffer();
//Navigate through text and create the Hash
for(int arrayCount=0;arrayCount<finalWordHolder.length;arrayCount++)
{
if(wordMap.containsKey(finalWordHolder[arrayCount]))
{
                hashIndex.append(wordMap.get(finalWordHolder[arrayCount]));
hashIndex.append(" ");
}
}
return hashIndex;
}
    //Code ("hash") generating algorithm: maps a sequential number to a unique
    //3-character code over the 46 symbols above (46^3 = 97,336 possible words);
    //a fixed-length encoding rather than a hash in the usual sense
public static String getSequence(final int i) {
return symbols[i / (symbols.length * symbols.length)] + symbols[(i / symbols.length) % symbols.length]
+ symbols[i % symbols.length];
}
    //Create a 3-character code ("hash") for each word in the word list
    private Map<String, String> createWordNumberingMap()
    {
        int number = 0;
        wordMap = new HashMap<String, String>();
BufferedReader br = null;
String str = "";
//First Read The File
File readingFile = new File("D:/Eclipse WorkSpace EE/HashCreator/WordList/NewWordsList.txt");
try
{
br = new BufferedReader(new FileReader(readingFile));
while((str=br.readLine())!=null)
{
str = str.trim();
String id = getSequence(number);
wordMap.put(str,id);
number++;
                System.out.println(id); //note: printing every id on each request is itself slow
}
br.close();
System.out.println("Completed");
System.out.println(wordMap.get("000"));
System.out.println("Last Number: "+number);
}
catch(Exception e)
{
e.printStackTrace();
}
finally
{
            try
            {
                if(br != null)
                {
                    br.close();
                }
            }
            catch(Exception e)
            {
                e.printStackTrace();
            }
}
return wordMap;
}
}
I did my best to trim the code so the post stays short. It is still large, but without all of these parts you would not be able to follow it.
Now to my problem: the program is very slow. If I enter 50 words into the application, it takes more than an hour to do the work I described above. I have been trying for two weeks to find a solution, but I can't. For reference, reading the 77 MB file itself takes no more than 12 seconds, so the mistake must be somewhere else.
Answer 0 (score 0):
Have you considered measuring how long some of these loops take, to pin down where the heavy lifting happens?
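For instance, here is a minimal, self-contained sketch of that kind of measurement (assuming Java 7 or later; the class name, map size, and loop counts are illustrative, not taken from the code above). It times a value-scan of the kind doPost performs for every code on every page:
import java.util.HashMap;
import java.util.Map;
public class LoopTimingSketch {
    public static void main(String[] args) {
        //Build a dummy word-to-code map roughly the size of a real word list
        Map<String, String> wordMap = new HashMap<String, String>();
        for (int i = 0; i < 90000; i++) {
            wordMap.put("word" + i, String.valueOf(i));
        }
        long start = System.nanoTime();
        int hits = 0;
        //One full scan of the map per looked-up code, as in the reverse
        //lookup above; 50 lookups mimic a 50-word query on a single page
        for (int lookup = 0; lookup < 50; lookup++) {
            String wanted = String.valueOf(lookup * 1000);
            for (Map.Entry<String, String> e : wordMap.entrySet()) {
                if (e.getValue().equals(wanted)) {
                    hits++;
                }
            }
        }
        long elapsedMs = (System.nanoTime() - start) / 1000000;
        System.out.println(hits + " hits, " + elapsedMs + " ms for one simulated page");
    }
}
Multiplying that per-page cost by the number of lines in the 77 MB file should show quickly whether the nested wordMap scans are the bottleneck; if they are, a second HashMap from code back to word would turn each scan into an O(1) lookup.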