我正在使用Apache Spark的示例代码跟踪文档:https://spark.apache.org/docs/latest/ml-features.html#countvectorizer
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.*;
/**
 * Demonstrates Spark ML's {@code CountVectorizer} on a tiny in-memory corpus.
 *
 * <p>Builds a DataFrame whose single column holds token lists, fits a
 * {@code CountVectorizerModel} from the corpus, and also shows how to build a
 * model from an a-priori vocabulary. Written against the Spark 1.5.x API
 * (SQLContext/DataFrame); requires spark-core, spark-sql, spark-mllib and
 * spark-catalyst on the classpath.
 */
public class CountVectorizer_Demo {
  public static void main(String[] args) {
    // A local single-threaded master is sufficient for this demo.
    SparkConf conf = new SparkConf().setAppName("LDA Online").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Input data: each row is a bag of words from a sentence or document.
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
        RowFactory.create(Arrays.asList("a", "b", "c")),
        RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
    ));
    // Schema: one non-nullable column "text" of array<string>.
    StructType schema = new StructType(new StructField[] {
        new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    // Fit a CountVectorizerModel from the corpus.
    CountVectorizerModel cvModel = new CountVectorizer()
        .setInputCol("text")
        .setOutputCol("feature")
        .setVocabSize(3)
        // A term must appear in at least 2 documents to enter the vocabulary.
        .setMinDF(2)
        .fit(df);

    // Alternatively, define a CountVectorizerModel with an a-priori vocabulary.
    // NOTE(review): cvm is built but never used below — kept to mirror the
    // official example; it can transform df exactly like cvModel does.
    CountVectorizerModel cvm = new CountVectorizerModel(new String[] {"a", "b", "c"})
        .setInputCol("text")
        .setOutputCol("feature");

    cvModel.transform(df).show();

    // Fix: release Spark resources — the original never stopped the context.
    sc.stop();
  }
}
但我收到了错误消息:
15/10/22 23:04:20 INFO BlockManagerMasterActor: Registering block manager localhost:56882 with 703.6 MB RAM, BlockManagerId(&lt;driver&gt;, localhost, 56882)
15/10/22 23:04:20 INFO BlockManagerMaster: Registered BlockManager
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/sql/catalyst/InternalRow
    at org.apache.spark.ml.feature.CountVectorizerParams$class.validateAndTransformSchema(CountVectorizer.scala:72)
    at org.apache.spark.ml.feature.CountVectorizer.validateAndTransformSchema(CountVectorizer.scala:107)
    at org.apache.spark.ml.feature.CountVectorizer.transformSchema(CountVectorizer.scala:168)
    at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:62)
    at org.apache.spark.ml.feature.CountVectorizer.fit(CountVectorizer.scala:130)
    at main.CountVectorizer_Demo.main(CountVectorizer_Demo.java:39)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.catalyst.InternalRow
    at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
    at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
    at java.security.AccessController.doPrivileged(Native Method)
    at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
    ... 还有6个
提前感谢。
答案 0（得分：0）
非常感谢大家。我通过添加依赖来解决我的问题:
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_2.10</artifactId>
<version>1.5.1</version>
</dependency>