如何用lucene从推文中提取日期?

时间:2017-05-26 17:45:46

标签: twitter lucene

我有一些我已编入索引的推文" TweetIndexer"与lucene知道每条推文包含ID,USER,TEXT和DATE。

我想只检索另一个班级的日期" TweetSearcher"怎么办?

推文示例:

"0","1467811372","Mon Apr 06 22:20:00 PDT 2009","NO_QUERY","joy_wolf","@Kwesidei not the whole crew ". 

我的班级TweetIndexer

   import org.apache.lucene.analysis.core.KeywordAnalyzer;
   import org.apache.lucene.document.Document;
   import org.apache.lucene.document.Field;
   import org.apache.lucene.document.FieldType;
   import org.apache.lucene.document.StringField;
   import org.apache.lucene.index.IndexWriter;
   import org.apache.lucene.index.IndexWriterConfig;
   import org.apache.lucene.store.FSDirectory;
   import org.apache.lucene.util.Version;
   import java.io.BufferedReader;
   import java.io.File;
   import java.io.FileReader;
   import java.io.IOException;
   public class TweetIndexer {

protected static final String COMMA = "\",\"";
protected static final String POLARITY = "polarity";
protected static final String ID = "id";
protected static final String DATE = "date";
protected static final String QUERY = "query";
protected static final String USER = "user";
protected static final String TEXT = "text";

public static void main(String[] args) throws Exception {
    try {
        String indexDir = "D:\\tweet\\index";
        String dataFile = "D:\\tweet\\collection\\tweets.csv";

        TweetIndexer tweetIndexer = new TweetIndexer();

        long start = System.currentTimeMillis();

        int count = tweetIndexer.index(new File(indexDir), new File(dataFile));

        System.out.print(String.format("Indexed %d documents in %d seconds", count, (System.currentTimeMillis() - start) / 1000));
    }
    catch (Exception e) {
        System.out.println("Usage: java TweetIndexer <index directory> <csv data file>");
    }
}

private int index(File indexDir, File dataFile) throws Exception {
    IndexWriter indexWriter = new IndexWriter(
            FSDirectory.open(indexDir),
            new IndexWriterConfig(Version.LUCENE_44, new KeywordAnalyzer()));

    int count = indexFile(indexWriter, dataFile);

    indexWriter.close();

    return count;
}

private int indexFile(IndexWriter indexWriter, File dataFile) throws IOException {
    FieldType fieldType = new FieldType();
    fieldType.setStored(true);
    fieldType.setIndexed(true);

    BufferedReader bufferedReader = new BufferedReader(new FileReader(dataFile));
    String line = "";
    int count = 0;
    while ((line = bufferedReader.readLine()) != null) {
        // Hack to ignore commas within elements in csv (so we can split on "," rather than just ,)
        line = line.substring(1, line.length() - 1);
        String[] tweetInfo = line.split(COMMA);

        Document document = new Document();

        document.add(new Field(POLARITY, tweetInfo[0], fieldType));
        document.add(new Field(ID, tweetInfo[1], fieldType));
        document.add(new Field(DATE, tweetInfo[2], fieldType));
        document.add(new Field(QUERY, tweetInfo[3], fieldType));
        document.add(new StringField(USER, tweetInfo[4], Field.Store.YES));
        document.add(new StringField(TEXT, tweetInfo[5], Field.Store.YES));

        indexWriter.addDocument(document);
        count++;
    }
    return count;
}

}

这是我的类TweetSearcher的简短java代码:

      public class TweetSearcher {

      public static void main(String[] args) throws Exception {

        try { 
             String indexDir = "D:\\tweet\\index";

             int numHits = Integer.parseInt("3");
             TweetSearcher tweetSearcher = new TweetSearcher();
             tweetSearcher.dateSearch(new File(indexDir), numHits);
             private void dateSearch(File indexDir, int numHits) throws Exception {
    System.out.println("Find dates:");

    Directory directory = FSDirectory.open(indexDir);

    DirectoryReader directoryReader = DirectoryReader.open(directory);

    IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

0 个答案:

没有答案