这是我用来计算文档集合中各文档TF-IDF值的程序。程序运行正常,但在计算“IDF”值(即查找包含特定术语的文档数量)时需要花费大量时间。
是否有更有效的方法来查找包含特定术语的文档?
// Score every term of the current document's term vector: the tf part comes
// from the stored in-document frequency, the idf part from the number of
// documents that contain the term.
freq = termsFreq.getTermFrequencies();
terms = termsFreq.getTerms();
int noOfTerms = terms.length;
score = new float[noOfTerms];
DefaultSimilarity simi = new DefaultSimilarity();
i = 0;
while (i < noOfTerms) {
    // One index search per term; this lookup is the step the question reports as slow.
    final int docsWithTerm = noOfDocsContainTerm(terms[i]);
    final float termWeight = simi.tf(freq[i]);
    score[i] = termWeight * simi.idf(docsWithTerm, noOfDocs);
    i++;
}
////
/**
 * Counts the documents in the in-memory index that match {@code querystr}.
 *
 * <p>The string is parsed with a {@link QueryParser}, so it is analyzed and
 * interpreted as query syntax rather than taken as a literal term. The number
 * of hits collected is capped at {@code docNames.length}.
 *
 * <p>For a plain document-frequency lookup of a single term,
 * {@code IndexReader.docFreq(Term)} avoids running a search at all.
 *
 * @param querystr query text, normally a single term
 * @return number of collected hits (at most {@code docNames.length})
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if the index cannot be read or the searcher cannot be closed
 * @throws ParseException if {@code querystr} is not valid query syntax
 */
public int noOfDocsContainTerm(String querystr) throws CorruptIndexException, IOException, ParseException {
    // NOTE(review): "docuemnt" looks like a misspelling of "document"; it is kept
    // as-is because it must match the field name used at indexing time - verify.
    QueryParser qp = new QueryParser(Version.LUCENE_35, "docuemnt", new StandardAnalyzer(Version.LUCENE_35));
    Query q = qp.parse(querystr);
    int hitsPerPage = docNames.length; // upper bound on the number of hits collected
    IndexSearcher searcher = new IndexSearcher(ramMemDir, true);
    try {
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(q, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        return hits.length;
    } finally {
        // This method is called once per term; without closing, every call
        // leaves behind the reader the searcher opened on the directory.
        searcher.close();
    }
}
答案 0 :(得分:6)
如果您有一个术语并希望得到其文档频率,即包含该术语的文档数量:调用IndexReader.terms(Term)方法,它会返回一个定位在该术语处的TermEnum对象;然后调用TermEnum.docFreq()方法,即可得到该术语在索引中的文档频率。也可以直接调用IndexReader.docFreq(Term)得到同样的结果。
答案 1 :(得分:3)
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
/*
* Date Author Changes April 14, 2012 Kasun Perera Created
*/
/*
*
* Class contains methods for indexing documents with Lucene, and calculating
* TFIDF weights
*/
public class DocIndexer {

    /** Index field holding the tokenized document text. */
    private static final String CONTENT_FIELD = "doccontent";
    /** Index field holding the document id derived from the file name. */
    private static final String DOC_ID_FIELD = "docid";

    // NOTE(review): the fields below are declared but never assigned or read by
    // any method of this class; they are kept so the class layout is unchanged.
    private String docNames[];
    private String docIDS[];
    private String pathToIndex;
    private String pathToDocumentCollection;
    private String fiboTermList[]; // marked up fibo terms
    private String taxoTermList[]; // marked up taxonomy terms
    private RAMDirectory ramMemDir;
    private String fileNames[];
    private byte files[][];
    private String filesInText[];
    int noOfWordsOfDOc[];
    int noOfSentencesOfDoc[];
    ArrayList<String> ArrLstSentencesOfDoc[];
    String removedTermsOfDOc[][];
    int freqAfterRemovalOfDoc[][];
    private int curDocNo;
    private final int maxTerms = 1000000;

    /**
     * Creates an indexer that writes its index to {@code pathToIndex} and reads
     * the documents to index from {@code pathToDocumentCollection}.
     *
     * @param pathToIndex directory in which the Lucene index is stored
     * @param pathToDocumentCollection directory containing the documents to index
     */
    public DocIndexer(String pathToIndex, String pathToDocumentCollection) {
        this.pathToIndex = pathToIndex;
        this.pathToDocumentCollection = pathToDocumentCollection;
    }

    /**
     * Counts the whitespace-separated words in a string.
     *
     * @param line input string
     * @return number of maximal runs of non-whitespace characters in {@code line}
     */
    private int wordCount(String line) {
        int numWords = 0;
        int index = 0;
        boolean prevWhiteSpace = true;
        while (index < line.length()) {
            char c = line.charAt(index++);
            boolean currWhiteSpace = Character.isWhitespace(c);
            // A word starts wherever a non-whitespace character follows whitespace
            // (or the start of the string).
            if (prevWhiteSpace && !currWhiteSpace) {
                numWords++;
            }
            prevWhiteSpace = currWhiteSpace;
        }
        return numWords;
    }

    /**
     * Reads a text file and returns its lines joined by single spaces.
     *
     * <p>The previous implementation started from a {@code null} string, so the
     * result always began with the literal text "null" (which was then indexed
     * as a term of every document), and it returned {@code null} for an empty
     * file. This version returns only the file's own text, and an empty string
     * for an empty file. The file is decoded with the platform default charset,
     * as before.
     *
     * @param filename path of the file to read
     * @return the file content with line breaks replaced by single spaces
     * @throws IOException if the file cannot be opened or read
     */
    public static String fileReader(String filename) throws IOException {
        StringBuilder text = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(new File(filename)));
        try {
            boolean first = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!first) {
                    text.append(' ');
                }
                text.append(line);
                first = false;
            }
        } finally {
            // Close on every path so a read failure does not leak the file handle.
            reader.close();
        }
        return text.toString();
    }

    /**
     * Indexes every regular file in the document collection directory that is
     * longer than one byte.
     *
     * <p>Each file becomes one Lucene document with two fields, both stored with
     * term vectors: {@code "doccontent"} (the file text with numeric tokens
     * stripped) and {@code "docid"} (the file name without its extension).
     * {@link #tfIdfScore(int)} later parses the {@code "docid"} value as an
     * integer, so file names are expected to be numeric.
     *
     * <p>Index-related I/O failures are reported on standard error rather than
     * propagated, as before. The writer and directory are always closed, so
     * documents added before a failure are still committed and the index write
     * lock is released.
     *
     * @throws IOException if the document collection directory cannot be listed
     */
    public void indexDocs() throws IOException {
        File folder = new File(pathToDocumentCollection);
        File[] listOfFiles = folder.listFiles();
        if (listOfFiles == null) {
            // listFiles() returns null when the path is not a directory or is unreadable.
            throw new IOException("Cannot list document collection directory: " + pathToDocumentCollection);
        }
        int noOfFiles = listOfFiles.length;
        System.out.println("Number of files : " + noOfFiles);

        int indexDocCount = 0;
        NIOFSDirectory dir = null;
        IndexWriter iW = null;
        try {
            dir = new NIOFSDirectory(new File(pathToIndex));
            iW = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_36, new WhitespaceAnalyzer(Version.LUCENE_36)));
            for (File file : listOfFiles) {
                if (!file.isFile()) {
                    continue;
                }
                String docName = file.getName();
                System.out.println("doc name: " + docName + "length - " + file.length());
                if (file.length() <= 1) {
                    continue;
                }
                // Use the File's own path so the collection path does not need a
                // trailing separator.
                String filesInText = fileReader(file.getPath());
                System.out.println("Added to index : " + docName);

                // Drop numbers (including grouped/decimal forms such as 1,000.5) before indexing.
                StringReader strRdElt = new StringReader(filesInText.replaceAll("\\d+(?:[.,]\\d+)*\\s*", ""));
                StringReader docId = new StringReader(stripExtension(docName));

                org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                doc.add(new Field(CONTENT_FIELD, strRdElt, Field.TermVector.YES));
                doc.add(new Field(DOC_ID_FIELD, docId, Field.TermVector.YES));
                iW.addDocument(doc);
                indexDocCount++;
            }
            System.out.println("no of documents added to index : " + indexDocCount);
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeAndReport(iW);
            closeAndReport(dir);
        }
    }

    /**
     * Computes the TF-IDF weight of every term of the first {@code numberOfDocs}
     * documents in the index.
     *
     * <p>The document count used for the idf part is taken from the index itself
     * ({@code IndexReader.numDocs()}); the previous implementation read it from a
     * field that was never initialized and therefore always failed. Requests
     * larger than the index are capped at the index size, and deleted documents
     * or documents without stored term vectors are skipped.
     *
     * <p>I/O failures are reported on standard error; the scores computed up to
     * that point are returned.
     *
     * @param numberOfDocs number of leading index documents to score
     * @return map from document id (the integer value of the {@code "docid"}
     *         field) to a {@code HashMap<String, Float>} of term to TF-IDF weight
     * @throws NumberFormatException if a document's {@code "docid"} is not an integer
     * @throws CorruptIndexException declared for compatibility
     * @throws ParseException declared for compatibility
     */
    public HashMap<Integer, HashMap> tfIdfScore(int numberOfDocs) throws CorruptIndexException, ParseException {
        HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
        Directory dir = null;
        IndexReader re = null;
        try {
            dir = NIOFSDirectory.open(new File(pathToIndex));
            re = IndexReader.open(dir, true);
            int noOfDocs = re.numDocs();
            int limit = Math.min(numberOfDocs, re.maxDoc());
            DefaultSimilarity simi = new DefaultSimilarity();

            for (int k = 0; k < limit; k++) {
                if (re.isDeleted(k)) {
                    continue;
                }
                TermFreqVector termsFreq = re.getTermFreqVector(k, CONTENT_FIELD);
                TermFreqVector termsFreqDocId = re.getTermFreqVector(k, DOC_ID_FIELD);
                // getTermFreqVector returns null when no term vector was stored for the field.
                if (termsFreq == null || termsFreqDocId == null || termsFreqDocId.size() == 0) {
                    continue;
                }
                int aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);

                int freq[] = termsFreq.getTermFrequencies();
                String terms[] = termsFreq.getTerms();
                HashMap<String, Float> wordMap = new HashMap<String, Float>();
                for (int i = 0; i < terms.length; i++) {
                    // Document frequency straight from the term dictionary; no search needed.
                    int noofDocsContainTerm = re.docFreq(new Term(CONTENT_FIELD, terms[i]));
                    float tf = simi.tf(freq[i]);
                    float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                    wordMap.put(terms[i], (tf * idf));
                }
                scoreMap.put(aInt, wordMap);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeAndReport(re);
            closeAndReport(dir);
        }
        return scoreMap;
    }

    /**
     * Computes TF-IDF weights for every document in the index.
     *
     * @return map from document id to a map of term to TF-IDF weight, as
     *         produced by {@link #tfIdfScore(int)}
     * @throws IOException declared for compatibility
     * @throws CorruptIndexException declared for compatibility
     * @throws ParseException declared for compatibility
     * @throws ClassNotFoundException declared for compatibility
     */
    public HashMap<Integer, HashMap> getTFIDF() throws IOException, CorruptIndexException, ParseException, ClassNotFoundException {
        // tfIdfScore caps the request at the index size, so this covers every document.
        return tfIdfScore(Integer.MAX_VALUE);
    }

    /**
     * Returns {@code fileName} without its last extension; names with no
     * extension (or only a leading dot) are returned unchanged.
     */
    private static String stripExtension(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot > 0 ? fileName.substring(0, dot) : fileName;
    }

    /** Closes {@code resource} if it is non-null, reporting any failure on standard error. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
答案 2 :(得分:0)
有一种高效的方法,可以使用Lucene api以简洁、优化的方式计算idf。在您为文档建立索引之后,就可以使用Lucene api中的静态函数。下面的代码同时计算tf和idf:
/**
 * Computes a TF-IDF weight for {@code word} in field {@code FIELD} using
 * {@link ClassicSimilarity}.
 *
 * <p>The tf part is derived from {@code getTermFrequencyInCollection(FIELD, word)};
 * the idf part from the term's document frequency and the field's document
 * count, both as reported by the searcher's statistics.
 *
 * @param FIELD name of the indexed field to look the word up in
 * @param word the term text
 * @param reader reader over the index to query
 * @return the product of the tf and idf weights
 * @throws IOException if reading index statistics fails
 */
public double getTFIDFScoreInCollection(String FIELD, String word, IndexReader reader)
        throws IOException {
    final IndexSearcher indexSearcher = new IndexSearcher(reader);
    final ClassicSimilarity scorer = new ClassicSimilarity();
    final IndexReaderContext topContext = indexSearcher.getTopReaderContext();
    final long docsInField = indexSearcher.collectionStatistics(FIELD).docCount();

    // tf part: based on the term's frequency across the collection.
    final BytesRef wordBytes = new BytesRef(word);
    final long collectionFreq = this.getTermFrequencyInCollection(FIELD, word);
    final float tfWeight = scorer.tf(collectionFreq);

    // idf part: based on how many documents contain the term.
    final Term target = new Term(FIELD, wordBytes);
    final TermStatistics stats =
            indexSearcher.termStatistics(target, TermContext.build(topContext, target));
    final float idfWeight = scorer.idf(stats.docFreq(), docsInField);

    return tfWeight * idfWeight;
}
不要忘记导入适当的依赖项:
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.util.BytesRef;