我尝试使用此代码来索引,搜索和解析csv推文

时间:2017-05-30 16:15:14

标签: java parsing search indexing lucene

我尝试使用下面的代码来索引、搜索和解析 CSV 格式的推文,但在执行时第一条推文的字段没有显示,而且搜索功能不起作用。有人可以帮助我吗?

   import java.io.*;
import java.util.StringTokenizer;
import java.util.Scanner;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Carrier for the six columns of one tweet record: id, mat, date, query, user and text.
 *
 * <p>NOTE(review): every field below is {@code static}, and the constructor assigns to
 * those static fields. All {@code TweetDoc} instances therefore share a single set of
 * values, and constructing a new instance overwrites what earlier instances expose.
 * Until the first construction the fields hold the placeholder names they are
 * initialised with.
 */
class TweetDoc {

    /** Tweet identifier; placeholder "id" until a tweet is constructed. */
    protected static String ID = "id";

    /** "mat" column of the record; placeholder "mat" until a tweet is constructed. */
    protected static String MAT = "mat";

    /** Date column of the record. */
    protected static String DATE = "date";

    /** Query column of the record. */
    protected static String QUERY = "query";

    /** User column of the record. */
    protected static String USER = "user";

    /** Text column of the record. */
    protected static String TEXT = "text";

    /**
     * Stores the given column values in the shared static fields.
     *
     * @param i value for {@link #ID}
     * @param m value for {@link #MAT}
     * @param d value for {@link #DATE}
     * @param q value for {@link #QUERY}
     * @param u value for {@link #USER}
     * @param t value for {@link #TEXT}
     */
    TweetDoc(String i, String m, String d, String q, String u, String t) {
        ID = i;
        MAT = m;
        DATE = d;
        QUERY = q;
        USER = u;
        TEXT = t;
    }
}
public class Lucenetweet {
    public static final String INDEX_DIR = "D:\\Tweets\\index";
    public static void main(String[] args) throws CorruptIndexException, IOException {

    if (args.length == 0) {
//READ FROM FILES
        BufferedReader reader = null ;
                int count = 0;




        try {

            File file = new File("D:\\Tweets\\collection\\tweets.csv");
            while (file.exists()) {
                System.out.println("Reading from file '" + file + "'...");
            reader = new BufferedReader(new FileReader(file));
                // Read every line in the file, and parse each tweet.
                for (String line; (line = reader.readLine()) != null; ) {
                    count++; //Count number of tweets
                   System.out.println("Tweets = " + count);
                  Scanner s = new Scanner(line).useDelimiter("\",\"");
                       String ID=s.next();
                       String MAT=s.next();
                       String DATE = s.next();
                       String QUERY = s.next();
                   String USER = s.next();
                       String TEXT=s.next();

                       String i= TweetDoc.ID;
                       System.out.println("l'identificateur est: " +i); 
                       String d=TweetDoc.DATE;
                       System.out.println("la date est :" +d);
                       String m=TweetDoc.MAT;
                       System.out.println("la matricule est :" +m );
                       String t=TweetDoc.TEXT;
                       System.out.println("le texte est: " +t);

                  TweetDoc tweet1 = new TweetDoc( ID,MAT, DATE, QUERY, USER, TEXT);    
                  index(tweet1);

                }

                    reader.close();
                System.out.println("Current number of tweets = " + count);
                //file_no++;
                //file = new File("D:\\tweet\\collection\\tweet"+file_no+".csv");

            }

        }
catch (IOException e) {
            e.printStackTrace();
        }

        finally {

           try {
                reader.close();
                System.out.println("Total number of tweets = " + count);
            }
           catch (IOException e) {
                e.printStackTrace();
            }}


        }
    }
public static void index (TweetDoc tweet) {
        File index = new File(INDEX_DIR);
        IndexWriter writer = null;

        try {   
            IndexWriterConfig indexConfig = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
            writer = new IndexWriter(FSDirectory.open(index), indexConfig);
            Document luceneDoc = new Document();
            //luceneDoc.add(new Field("POLARITY", tweet.POLARITY, Field.Index.NO));
                        //luceneDoc.add(new Field("ID", TweetDoc.ID, Field.Index.NO));
                        luceneDoc.add(new Field("DATE", tweet.DATE, Field.Store.YES, Field.Index.NO));
                        //luceneDoc.add(new Field("QUERY", tweet.QUERY, Field.Index.NO));
            luceneDoc.add(new Field("USER", tweet.USER, Field.Store.YES, Field.Index.NO));
            luceneDoc.add(new Field("TEXT", tweet.TEXT, Field.Store.YES, Field.Index.ANALYZED));
            //luceneDoc.add(new Field("ptitle", tweet.ptitle, Field.Store.YES, Field.Index.ANALYZED));
            luceneDoc.setBoost((float)2.0);
            writer.addDocument(luceneDoc);
} catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            if (writer !=null)
                try {
                    writer.close();
                } catch (CorruptIndexException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
        }

    }
 public static String[] search (String queryString, int topk) throws CorruptIndexException, IOException {

        IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(INDEX_DIR)));
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        QueryParser queryparser = new QueryParser(Version.LUCENE_36, "TEXT", new StandardAnalyzer(Version.LUCENE_36));
    try {
            StringTokenizer strtok = new StringTokenizer(queryString, " ~`!@#$%^&*()_-+={[}]|:;'<>,./?\"\'\\/\n\t\b\f\r");
            String querytoparse = " ";
            while(strtok.hasMoreElements()) {
                String token = strtok.nextToken();
                      querytoparse += "text:" + token;
}       
            Query query = queryparser.parse(querytoparse);
            System.out.println(query.toString());
            TopDocs results = indexSearcher.search(query, topk);
            int num_results = results.scoreDocs.length;
            System.out.println(num_results);
            String[] returnTweets = new String[num_results];
            for (int i = 0; i < num_results; i++) {
                                String temp = "@" + indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("USER").stringValue();
                String DATE = indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("DATE").stringValue();
                DATE = DATE.replace("+0000", "");
                temp += ": " + indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("QUERY").stringValue();
                temp += "<br/>" + DATE + "    Score: " +  results.scoreDocs[i].score;;
                System.out.println(indexSearcher.doc(results.scoreDocs[i].doc).getFieldable("TEXT").stringValue());
                System.out.println("score: " + results.scoreDocs[i].score);
                returnTweets[i] = temp;

            }


            return returnTweets;            
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            indexSearcher.close();
        }
        return null;
    }


}

0 个答案:

没有答案