无法运行导入org.apache.lucene.analysis.Analyzer的Java程序

时间:2014-11-02 15:28:20

标签: apache maven hadoop lucene mahout

我无法运行下面这个导入了org.apache.lucene.analysis.Analyzer的Java程序:

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.classifier.naivebayes.BayesUtils;
import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.apache.mahout.vectorizer.TFIDF;

import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;


/**
 * Command-line driver that classifies every line of a text file with a trained
 * Mahout naive Bayes model and reports how often the predicted label matches
 * the label given on the line.
 *
 * <p>Each input line is expected to have the form
 * {@code <id> <label> <message...>}, with single spaces between the parts.
 */
public class Classifier {

  /** File that receives one line of per-label scores per input line (append mode). */
  private static final String OUTPUT_FILE = "/home/hduser/result.txt";

  /** Cardinality of the sparse feature vector built for each message. */
  private static final int VECTOR_CARDINALITY = 10000;

  /** Key of the document-frequency entry that {@link #main} reads as the total document count. */
  private static final int DOCUMENT_COUNT_KEY = -1;

  /**
   * Loads a word-to-id dictionary from a sequence file.
   *
   * @param conf Hadoop configuration used to open the file
   * @param dictionnaryPath location of the {@code Text -> IntWritable} sequence file
   * @return a map from word to word id
   */
  public static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) {
    Map<String, Integer> dictionnary = new HashMap<String, Integer>();
    for (Pair<Text, IntWritable> pair
        : new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true, conf)) {
      dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
    }
    return dictionnary;
  }

  /**
   * Loads per-word document frequencies from a sequence file.
   *
   * @param conf Hadoop configuration used to open the file
   * @param documentFrequencyPath location of the {@code IntWritable -> LongWritable} sequence file
   * @return a map from word id to document frequency
   */
  public static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) {
    Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
    for (Pair<IntWritable, LongWritable> pair
        : new SequenceFileIterable<IntWritable, LongWritable>(documentFrequencyPath, true, conf)) {
      documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
    }
    return documentFrequency;
  }

  /**
   * Classifies each line of the test file, appends the per-label scores to
   * {@link #OUTPUT_FILE}, prints "expected => predicted" for every line and
   * finally prints the overall accuracy.
   *
   * @param args {@code [model] [label index] [dictionnary] [document frequency] [tweet file]}
   * @throws Exception if the model, the index files or the test file cannot be read
   */
  public static void main(String[] args) throws Exception {
    System.out.println("Start time :" + System.currentTimeMillis());
    if (args.length < 5) {
      System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
      return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String testFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels maps classId => label
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency =
        readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from each message
    Analyzer analyzer = new DefaultAnalyzer();

    int labelCount = labels.size();
    Long documentCountEntry = documentFrequency.get(DOCUMENT_COUNT_KEY);
    if (documentCountEntry == null) {
      // Fail with a readable message instead of a bare NullPointerException.
      throw new IllegalStateException("Document frequency file " + documentFrequencyPath
          + " has no entry for key " + DOCUMENT_COUNT_KEY + " (total document count)");
    }
    int documentCount = documentCountEntry.intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    int correctCounter = 0;
    int totalCounter = 0;

    // try/finally (the stated target is Java 1.6) so both streams are closed on every path.
    BufferedReader reader = new BufferedReader(new FileReader(testFilePath));
    try {
      BufferedWriter out = new BufferedWriter(new FileWriter(OUTPUT_FILE, true));
      try {
        String line;
        while ((line = reader.readLine()) != null) {
          String[] arr = line.split(" ");
          if (arr.length < 2) {
            System.err.println(
                "Skipping malformed line (expected \"<id> <label> <message>\"): " + line);
            continue;
          }
          String catId = arr[0];
          String label = arr[1];
          // Clamp so a line that carries only "<id> <label>" yields an empty message.
          int msgStart = Math.min(catId.length() + label.length() + 2, line.length());
          String msg = line.substring(msgStart);

          Multiset<String> words = countKnownWords(analyzer, msg, dictionary);
          Vector vector = toTfIdfVector(words, dictionary, documentFrequency, documentCount);

          // The classifier yields one score per label. Scores are negated, so the
          // smallest negated score marks the label with the highest raw score.
          Vector resultVector = classifier.classifyFull(vector);
          double bestScore = Double.MAX_VALUE;
          int bestCategoryId = -1;
          StringBuilder resultStr = new StringBuilder(catId).append(' ');
          boolean first = true;
          for (Element element : resultVector) {
            int categoryId = element.index();
            double score = -1 * element.get();
            if (score < bestScore) {
              bestScore = score;
              bestCategoryId = categoryId;
            }
            if (!first) {
              resultStr.append("   ");
            }
            resultStr.append(labels.get(categoryId)).append(' ').append(score);
            first = false;
          }

          try {
            out.write(resultStr.toString());
            out.write("\n");
          } catch (IOException e) {
            // Keep classifying, but make the failed write visible.
            System.err.println(
                "Failed to write scores for " + catId + " to " + OUTPUT_FILE + ": " + e);
          }

          String predictedLabel = labels.get(bestCategoryId);
          // Expected label => predicted label.
          System.out.println(label + " => " + predictedLabel);
          totalCounter++;
          if (label.equalsIgnoreCase(predictedLabel)) {
            correctCounter++;
            System.out.println("correctCounter : " + correctCounter);
          }
        }
      } finally {
        out.close();
      }
    } finally {
      reader.close();
    }

    System.out.println("correctCounter : " + correctCounter + " TotalCounter :" + totalCounter);
    System.out.println("End time :" + System.currentTimeMillis());
    System.out.println("Accuracy : " + (double) correctCounter / totalCounter);
  }

  /**
   * Tokenizes a message and counts the tokens that appear in the dictionary.
   *
   * @param analyzer analyzer that splits the message into tokens
   * @param msg message text to tokenize
   * @param dictionary known words; tokens missing from it are ignored
   * @return the in-dictionary tokens with their occurrence counts
   * @throws IOException if the token stream cannot be read
   */
  private static Multiset<String> countKnownWords(
      Analyzer analyzer, String msg, Map<String, Integer> dictionary) throws IOException {
    Multiset<String> words = ConcurrentHashMultiset.create();
    TokenStream ts = analyzer.reusableTokenStream("text", new StringReader(msg));
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
          String word = termAtt.toString();
          // if the word is not in the dictionary, skip it
          if (dictionary.get(word) != null) {
            words.add(word);
          }
        }
      }
      ts.end();
    } finally {
      ts.close();
    }
    return words;
  }

  /**
   * Builds the sparse {@code wordId => TF-IDF weight} vector for one message.
   *
   * @param words in-dictionary tokens of the message with their counts
   * @param dictionary map from word to word id
   * @param documentFrequency map from word id to document frequency
   * @param documentCount number of documents in the training set
   * @return the weighted feature vector
   */
  private static Vector toTfIdfVector(
      Multiset<String> words,
      Map<String, Integer> dictionary,
      Map<Integer, Long> documentFrequency,
      int documentCount) {
    // Multiset.size() counts every occurrence, i.e. the number of known words in the message.
    int wordCount = words.size();
    Vector vector = new RandomAccessSparseVector(VECTOR_CARDINALITY);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
      Integer wordId = dictionary.get(entry.getElement());
      Long freq = documentFrequency.get(wordId);
      if (freq == null) {
        // No document frequency recorded for this word, so it cannot be weighted.
        continue;
      }
      double tfIdfValue =
          tfidf.calculate(entry.getCount(), freq.intValue(), wordCount, documentCount);
      vector.setQuick(wordId, tfIdfValue);
    }
    return vector;
  }
}

我的ubuntu系统上的配置:Hadoop 1.2.0,Mahout 0.7,Lucene 1.4.1,Java 1.6

我编译了java程序而没有任何错误,并生成了一个jar文件:

javac -classpath /usr/local/hadoop/*:/usr/local/mahout/*:/usr/local/lucene/* -d Classifier_Class/ Classifier.java

jar -cvf Classify.jar -C Classifier_Class/ .

当我尝试使用hadoop执行此jar时,我面临以下错误:

hadoop jar Classify.jar {input arguments}
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/lucene/analysis/Analyzer
    at java.lang.Class.forName0(Native Method)
    at java.lang.Class.forName(Class.java:270)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:153)
Caused by: java.lang.ClassNotFoundException: org.apache.lucene.analysis.Analyzer
    at java.net.URLClassLoader$1.run(URLClassLoader.java:217)
    at java.security.AccessController.doPrivileged(Native Method)
    at java.net.URLClassLoader.findClass(URLClassLoader.java:205)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:323)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:268)
    ... 3 more

1 个答案:

答案 0 :(得分:0)

您的问题在于:构建应用程序的jar时,Lucene的Maven依赖项(.jar文件)没有被打包进您的jar中,因此运行时Java找不到您的应用程序所使用的Lucene类。我也遇到过相同的堆栈跟踪。

您最好使用Maven来构建项目。

您需要做的是连同依赖项一起构建源码,这样编译打包后的应用程序就会包含Lucene的jar包。

为此,只需在Maven项目的pom.xml文件中添加以下配置即可:
    <build>
        <plugins>
            <!-- Assembly plugin: packages the project together with its dependencies. -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.4</version>
                <configuration>
                    <descriptorRefs>
                        <!-- Predefined descriptor; the resulting jar carries the "-jar-with-dependencies" suffix. -->
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <!-- Run the "single" goal during the package phase. -->
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

现在,运行您的应用程序:hadoop jar APP_NAME-jar-with-dependencies.jar

以上所有内容都可以解决您的问题。