如何打印mahout lda cvb主题

时间:2013-06-07 23:50:53

标签: java cluster-analysis mahout lda

我想使用 Mahout 的 CVB0Driver API 运行 LDA CVB 聚类作业,但不知道如何打印结果。下面是我的代码。

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.lda.cvb.CVB0Driver;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.text.SequenceFilesFromDirectory;
import org.apache.mahout.utils.vectors.RowIdJob;
import org.apache.mahout.utils.vectors.VectorDumper;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LDAJob extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(Job.class);
    static int numTopics = 20;
    static double doc_topic_smoothening = 0.0001;
    static double term_topic_smoothening = 0.0001;
    static int maxIter = 10;
    static int iteration_block_size = 10;
    static double convergenceDelta = 0;
    static float testFraction = 0.0f;
    static int numTrainThreads = 4;
    static int numUpdateThreads = 1;
    static int maxItersPerDoc = 10;
    static int numReduceTasks = 10;
    static boolean backfillPerplexity = false;

public static void main(String args[]) throws Exception {
    // String baseFileLocation = args[0];
    String baseFileLocation = "/Users/pin/java";
    Path output = new Path(baseFileLocation, "/output");
    Configuration conf = new Configuration();
    HadoopUtil.delete(conf, output);
    String[] ldaArgs = { "-DbaseFileLocation=" + baseFileLocation };
    // String[] strings =
    // {"-Dmapred.input.dir=VectorFile/tfidf-vectors/part-r-00000"};
    ToolRunner.run(new LDAJob(), ldaArgs);
    System.out.println("done");
}

public int run(String[] arg0) throws Exception {
    Configuration conf = getConf();
    // String baseFileLocation = "/Users/pin/java";
    String baseFileLocation = conf.get("baseFileLocation");
    Path input = new Path(baseFileLocation, "/reuters-out");
    System.out.println(input.toString());
    String seqFileOutput = "SeqFile";
    String vectorOutFile = "VectorFile";
    String rowIDOutFile = "RowIdOutput";
    String ldaOutputFile = "topicModelOutputPath";
    String dictionaryFileName = vectorOutFile + "/dictionary.file-0";
    String tempLDAModelFile = "modelTempPath";
    String docTopicOutput = "docTopicOutputPath";
    String topicTermVectorDumpPath = "topicTermVectorDump";
    String docTopicVectorDumpPath = "docTopicVectorDump";

    // String topicTermVectorDump = "topicTermVectorDump";

    log.info("Deleting all the previous files.");
    HadoopUtil.delete(conf, new Path(seqFileOutput));
    HadoopUtil.delete(conf, new Path(vectorOutFile));
    HadoopUtil.delete(conf, new Path(rowIDOutFile));
    HadoopUtil.delete(conf, new Path(ldaOutputFile));
    HadoopUtil.delete(conf, new Path(docTopicOutput));
    HadoopUtil.delete(conf, new Path(tempLDAModelFile));
    HadoopUtil.delete(conf, new Path(topicTermVectorDumpPath));
    HadoopUtil.delete(conf, new Path(docTopicVectorDumpPath));

    // S3FileSystem.
    log.info("Step1: convert the directory into seqFile.");
    System.out.println("starting dir to seq job");
    String[] dirToSeqArgs = { "--input", input.toString(), "--output",
            seqFileOutput };
    ToolRunner.run(new SequenceFilesFromDirectory(), dirToSeqArgs);
    System.out.println("finished dir to seq job");

    log.info("Step 2: converting the seq to vector.");
    System.out.println("starting seq To Vector job");
    String[] seqToVectorArgs = { "--input", seqFileOutput, "--output",
            vectorOutFile, "--maxDFPercent", "70", "--maxNGramSize", "2",
            "--namedVector", "--analyzerName",
            "org.apache.lucene.analysis.WhitespaceAnalyzer" };
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), seqToVectorArgs);
    System.out.println("finished seq to vector job");

    log.info("Step3: convert SequenceFile<Text, VectorWritable> to  SequenceFile<IntWritable, VectorWritable>");
    System.out.println("starting rowID job");
    String[] rowIdArgs = {
            "-Dmapred.input.dir=" + vectorOutFile
                    + "/tfidf-vectors/part-r-00000",
            "-Dmapred.output.dir=" + rowIDOutFile };
    ToolRunner.run(new RowIdJob(), rowIdArgs);
    System.out.println("finished rowID job");

    log.info("Step4: Run the LDA algo");
    System.out.println("starting caluclulating the number of terms");
    //int numTerms = getNumTerms(new Path(dictionaryFileName));
    System.out.println("finished calculating the number of terms");
    long seed = System.nanoTime() % 10000;
    System.out.println("starting the CVB job");
    CVB0Driver.run(conf, new Path(rowIDOutFile + "/matrix"), new Path(
            ldaOutputFile), numTopics, 0, doc_topic_smoothening,
            term_topic_smoothening, maxIter, iteration_block_size,
            convergenceDelta, new Path(dictionaryFileName), new Path(
                    docTopicOutput), new Path(tempLDAModelFile), seed,
            testFraction, numTrainThreads, numUpdateThreads,
            maxItersPerDoc, numReduceTasks, backfillPerplexity);
    //String[] runArgs ={};
    System.out.println("finished the cvb job");

    log.info("Step5: vectordump topic-term");

    System.out.println("starting the vector dumper for topic term");
    String[] topicTermDumperArg = {"--seqFile", ldaOutputFile+"/part-m-00000",  "--dictionary", 
            dictionaryFileName, "-dt", "sequencefile"  };
    //ToolRunner.run(new Configuration(), new CustomVectorDumper(), topicTermDumperArg);
    //VectorDumper.main(topicTermDumperArg);
    //SequenceFileDumper.main(topicTermDumperArg);
    //String[] topicTermDumperArg = {"--input", ldaOutputFile, "--output", topicTermVectorDumpPath,  "--dictionary", 
    //        dictionaryFileName, "-dt", "sequencefile" ,"--vectorSize", "25" ,"-sort", "testsortVectors" };
    //LDAPrintTopics.main(topicTermDumperArg);
    //String[] topicTermDumperArg = {"-seq"};
    VectorDumper.main(topicTermDumperArg);
    System.out.println("finisher the vector dumper for topicterm");
    //System.out.println("starting the vector dumper for doctopic dumper");
    //String[] docTopicDumperArg = {"--input", docTopicOutput, "--output", docTopicVectorDumpPath};
    //ToolRunner.run(new Configuration(), new CustomVectorDumper(), docTopicDumperArg);
    //VectorDumper.main(docTopicDumperArg);
    System.out.println("finsiher the vector dumper for doctopic dumper");

    //printLdaResults(ldaOutputFile, numTerms);
    //MongoDumper dumper = new MongoDumper();
    //dumper.writeTopicCollection(topicTermVectorDumpPath.toString());
    return 0;
}
}

程序在运行到VectorDumper.main(topicTermDumperArg);时卡住了。

我使用的是 mahout-core-0.7 和 mahout-utils-0.5;新闻数据可以点击此处下载(click here)。

2 个答案:

答案 0 :(得分:1)

我正在使用Mahout 0.9,以下内容适用于我:

     VectorDumper.main(new String[] 
            { "-i",
            OUTPUT_DIR + "/topic-term-dist/part-m-00000", "-o",
            OUTPUT_DIR + "/results", "-d",
                OUTPUT_DIR + "/dictionary.file-0", "-dt", "sequencefile",
            "-sort", "true", "-vs", "20" });

上面,OUTPUT_DIR是我运行LDA作业的文件夹。它打印每个主题的前20个术语。

答案 1 :(得分:0)

这对我有用

您应该先获取词项(term)的数量,并把它传给 CVB 来运行。然后:

    // Dump one part file per topic index into its own output file "topic<k>".
    for (int k = 0; k < nTopics; k++) {
        System.out.println("Dumping topic \t" + k);
        // Part files carry a five-digit, zero-padded index. Formatting it
        // keeps the name right for k >= 100, where prefixing a fixed number
        // of zeros would produce six digits. Locale.ROOT pins ASCII digits.
        String partFile = String.format(java.util.Locale.ROOT, "part-m-%05d", k);

        String output = "topic" + k;
        String[] topicTermDumperArg = {
            "-s", ldaOutputFile + "/" + partFile,
            "-dt", "sequencefile",
            "-d", dictionaryFileName,
            "-o", output,
            "-c" };

        VectorDumper.main(topicTermDumperArg);
    }