我正在使用FileInputStream
逐个字符地阅读Java中的9 KB文本文件,并且花了将近一分钟的时间来阅读。这种表现是否良好,或者可以通过使用其他Stream
BufferdReader
并在内存中一次读取整个数据来优化这一点。
// This method is used to read the Brown Corpus
public void readBrownCorpus(String corpusPath) throws IOException {
FileInputStream inputStream = null;
try {
inputStream = new FileInputStream(corpusPath);
int letter = 0; // denote current read letter
String previousTag = "^";
StringBuilder wordWithTag = new StringBuilder(); // denote the string which
while((letter = inputStream.read()) != -1) {
if(((char) letter) != ' ')
wordWithTag.append((char) letter);
else {
String word[] = wordWithTag.substring(0).split("_");
if(word != null && word.length != 2)
throw new Exception("Error in the Format of Corpus");
// If new tag found, insert this in both transitionTable and emissionTable
if(transitionTable.get(word[1]) == null) {
insertTagInTransitionTable(previousTag, word[1]);
}
updateTranstionTable(previousTag, word[1]);
updateEmissionTable(word[0], word[1]);
// update the previous Tag
if(word[1].equals("."))
previousTag = "^";
else
previousTag = word[1];
wordWithTag.setLength(0); //empty the wordWithTag for new word
System.out.println(transitionTable.size());
}
}
} catch(IOException ioException) {
ioException.printStackTrace();
} catch(Exception exception) {
exception.printStackTrace();
}
finally {
inputStream.close();
}
}
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package demo;
import java.util.*;
import java.io.*;
/**
*
* @author Jatin Khurana
*/
public class Main {
public HashMap<String,Row> transitionTable; // Transition Table
public HashMap<String,Row> emissionTable; // Emission Table
// Constructor
public Main()
{
transitionTable=new HashMap<String,Row>();
emissionTable=new HashMap<String,Row>();
prepareInitialTransitionTable();
//prepareInitialEmissionTable();
}
// This method prepare the initial transition Table
private void prepareInitialTransitionTable()
{
Row row1=new Row();
row1.tagCount.put("^", 0.0f);
row1.tagCount.put(".", 0f);
Row row2=new Row();
row2.tagCount.put("^", 0f);
row2.tagCount.put(".", 0f);
transitionTable.put("^", row1);
transitionTable.put(".", row2);
}
/**
* @param args the command line arguments
*/
public static void main(String[] args) throws IOException{
Main m=new Main();
BufferedReader inputStream=null;
try
{
inputStream=new BufferedReader(new FileReader("d://postagger//corpus//brown.txt"));
String corpusData = inputStream.readLine();
String previousTag="^";
String wordWithTag[] = corpusData.split(" ");
for(int i=0;i<wordWithTag.length;i++)
{
String word[]=wordWithTag[i].split("_");
if(word!=null && word.length!=2)
throw new Exception("Error in the Format of Corpus");
// If new tag found,insert this in both transitionTable and emissionTable
if(m.transitionTable.get(word[1])==null)
{
m.insertTagInTransitionTable(previousTag,word[1]);
}
m.updateTranstionTable(previousTag,word[1]);
m.updateEmissionTable(word[0],word[1]);
// update the previous Tag
if(word[1].equals("."))
{
previousTag="^";
}
else
{
previousTag=word[1];
}
System.out.println(m.transitionTable.size());
}
}
catch(IOException ioException)
{
ioException.printStackTrace();
}
catch(Exception exception)
{
exception.printStackTrace();
}
finally
{
inputStream.close();
}
}
private void insertTagInTransitionTable(String previousTag,String newTag) throws CloneNotSupportedException
{
Row row = (Row)transitionTable.get(previousTag);
row.tagCount.put(newTag,0f);
Row newRow=new Row();
transitionTable.put(newTag, newRow);
}
// This method is used to update the transitionTable
private void updateTranstionTable(String previousTag,String currentTag)
{
Row row = transitionTable.get(previousTag);
if(row.tagCount.get(currentTag)==null)
{
row.tagCount.put(currentTag, 1f);
}
else
{
row.tagCount.put(currentTag, row.tagCount.get(currentTag)+1);
}
}
// This method is used to update the emission table
private void updateEmissionTable(String word,String tag)
{
Row row = emissionTable.get(word);
if(row==null)
{
Row newRow=new Row();
newRow.tagCount.put(tag, 1f);
emissionTable.put(word, newRow);
}
else
{
if(row.tagCount.get(tag)==null)
{
row.tagCount.put(tag, 1f);
}
else
{
row.tagCount.put(tag,row.tagCount.get(tag)+1);
}
}
}
}
我的导师说我必须在3到5秒内完成同样的操作。这是怎么做到的?
答案 0 :(得分:0)
使用BufferedInputStream包装FileInputStream以进行简单的快速修复。然后考虑使用readLine()