// I tried to use this code to index, search and parse CSV tweets, but when it runs,
// the first tweet's fields are not displayed and the search class does not work.
// Can someone help me?
import java.io.*;
import java.util.StringTokenizer;
import java.util.Scanner;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Holder for the six columns of one CSV tweet record.
 *
 * NOTE(review): every field here is static, so all TweetDoc "instances" share
 * one set of values and each constructor call overwrites the previous tweet's
 * data. This only works because the caller indexes each tweet immediately
 * after constructing it. Converting these to instance fields would be the
 * proper fix, but would break the existing static reads (TweetDoc.ID, ...)
 * elsewhere in this file — confirm all call sites before changing.
 */
class TweetDoc {
    protected static String ID = "id";
    protected static String MAT = "mat";
    protected static String DATE = "date";
    protected static String QUERY = "query";
    protected static String USER = "user";
    protected static String TEXT = "text";

    /** Copies the six column values into the (shared, static) fields. */
    TweetDoc(String id, String mat, String date, String query, String user, String text) {
        TweetDoc.ID = id;
        TweetDoc.MAT = mat;
        TweetDoc.DATE = date;
        TweetDoc.QUERY = query;
        TweetDoc.USER = user;
        TweetDoc.TEXT = text;
    }
}
public class Lucenetweet {
public static final String INDEX_DIR = "D:\\Tweets\\index";
public static void main(String[] args) throws CorruptIndexException, IOException {
if (args.length == 0) {
//READ FROM FILES
BufferedReader reader = null ;
int count = 0;
try {
File file = new File("D:\\Tweets\\collection\\tweets.csv");
while (file.exists()) {
System.out.println("Reading from file '" + file + "'...");
reader = new BufferedReader(new FileReader(file));
// Read every line in the file, and parse each tweet.
for (String line; (line = reader.readLine()) != null; ) {
count++; //Count number of tweets
System.out.println("Tweets = " + count);
Scanner s = new Scanner(line).useDelimiter("\",\"");
String ID=s.next();
String MAT=s.next();
String DATE = s.next();
String QUERY = s.next();
String USER = s.next();
String TEXT=s.next();
String i= TweetDoc.ID;
System.out.println("l'identificateur est: " +i);
String d=TweetDoc.DATE;
System.out.println("la date est :" +d);
String m=TweetDoc.MAT;
System.out.println("la matricule est :" +m );
String t=TweetDoc.TEXT;
System.out.println("le texte est: " +t);
TweetDoc tweet1 = new TweetDoc( ID,MAT, DATE, QUERY, USER, TEXT);
index(tweet1);
}
reader.close();
System.out.println("Current number of tweets = " + count);
//file_no++;
//file = new File("D:\\tweet\\collection\\tweet"+file_no+".csv");
}
}
catch (IOException e) {
e.printStackTrace();
}
finally {
try {
reader.close();
System.out.println("Total number of tweets = " + count);
}
catch (IOException e) {
e.printStackTrace();
}}
}
}
public static void index (TweetDoc tweet) {
File index = new File(INDEX_DIR);
IndexWriter writer = null;
try {
IndexWriterConfig indexConfig = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
writer = new IndexWriter(FSDirectory.open(index), indexConfig);
Document luceneDoc = new Document();
//luceneDoc.add(new Field("POLARITY", tweet.POLARITY, Field.Index.NO));
//luceneDoc.add(new Field("ID", TweetDoc.ID, Field.Index.NO));
luceneDoc.add(new Field("DATE", tweet.DATE, Field.Store.YES, Field.Index.NO));
//luceneDoc.add(new Field("QUERY", tweet.QUERY, Field.Index.NO));
luceneDoc.add(new Field("USER", tweet.USER, Field.Store.YES, Field.Index.NO));
luceneDoc.add(new Field("TEXT", tweet.TEXT, Field.Store.YES, Field.Index.ANALYZED));
//luceneDoc.add(new Field("ptitle", tweet.ptitle, Field.Store.YES, Field.Index.ANALYZED));
luceneDoc.setBoost((float)2.0);
writer.addDocument(luceneDoc);
} catch (Exception ex) {
ex.printStackTrace();
} finally {
if (writer !=null)
try {
writer.close();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static String[] search (String queryString, int topk) throws CorruptIndexException, IOException {
IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(INDEX_DIR)));
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
QueryParser queryparser = new QueryParser(Version.LUCENE_36, "TEXT", new StandardAnalyzer(Version.LUCENE_36));
try {
StringTokenizer strtok = new StringTokenizer(queryString, " ~`!@#$%^&*()_-+={[}]|:;'<>,./?\"\'\\/\n\t\b\f\r");
String querytoparse = " ";
while(strtok.hasMoreElements()) {
String token = strtok.nextToken();
querytoparse += "text:" + token;
}
Query query = queryparser.parse(querytoparse);
System.out.println(query.toString());
TopDocs results = indexSearcher.search(query, topk);
int num_results = results.scoreDocs.length;
System.out.println(num_results);
String[] returnTweets = new String[num_results];
for (int i = 0; i < num_results; i++) {
String temp = "@" + indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("USER").stringValue();
String DATE = indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("DATE").stringValue();
DATE = DATE.replace("+0000", "");
temp += ": " + indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("QUERY").stringValue();
temp += "<br/>" + DATE + " Score: " + results.scoreDocs[i].score;;
System.out.println(indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("TEXT").stringValue());
System.out.println("score: " + results.scoreDocs[i].score);
returnTweets[i] = temp;
}
return returnTweets;
} catch (Exception e) {
e.printStackTrace();
} finally {
indexSearcher.close();
}
return null;
}
}