I have the following code. I am trying to load some data from a Wikipedia database dump, stored in a tab-separated csv file. The csv file contains some metadata plus the full article text. I am trying to extract the words that appear in the form [[some word]], but it does not seem to work: as soon as I call count() or printSchema(), I get an error that I do not understand.
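For context, here is the kind of extraction I am after, as a minimal standalone sketch with plain java.util.regex and a made-up sample sentence (the class name and sample text are mine; it is the same pattern as in the Spark code below):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LinkExtractDemo {
    public static void main(String[] args) {
        String article = "The [[quick fox]] jumps over the [[lazy dog]].";
        // \x5b = '[' and \x5d = ']', so this matches [[words and spaces]]
        Pattern p = Pattern.compile("\\x5b\\x5b[\\w\\s]*\\x5d\\x5d");
        Matcher m = p.matcher(article);
        while (m.find()) {
            System.out.println(m.group()); // [[quick fox]], then [[lazy dog]]
        }
    }
}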
// Imports used by this snippet (the methods below live in experiment.Main;
// pathToFolder is a String field defined elsewhere in that class):
import static org.apache.spark.sql.functions.callUDF;
import static org.apache.spark.sql.functions.col;

import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.collection.mutable.WrappedArray;

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
            .appName("HelloWorld")
            .master("local")
            .getOrCreate();
    spark.sparkContext().setLogLevel("ERROR");
    printSparkBegin();

    SQLContext context = new org.apache.spark.sql.SQLContext(spark);

    // Schema of the tab-separated dump: title, id, date, full article text
    StructType schema = new StructType(new StructField[] {
            new StructField("title", DataTypes.StringType, false, Metadata.empty()),
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("date", DataTypes.StringType, false, Metadata.empty()),
            new StructField("article", DataTypes.StringType, false, Metadata.empty()),
    });

    // Raw CSV DataFrame
    Dataset<Row> df = context.read()
            .format("com.databricks.spark.csv")
            .schema(schema)
            .option("header", "false")
            .option("delimiter", "\t")
            .option("mode", "DROPMALFORMED")
            .load(pathToFolder + "wiki_9.csv"
                    // "wiki_9_1.csv"
            );
    showNSchema(df, "df: raw csv", 0, true);

    // Tokenize 1: split each article into words and count them per row
    RegexTokenizer rtGetWords = new RegexTokenizer()
            .setInputCol("article")
            .setOutputCol("words")
            .setPattern("\\W");
    spark.udf().register("countTokens",
            (WrappedArray<?> words) -> words.size(), DataTypes.IntegerType);
    Dataset<Row> rtdAllWords = rtGetWords.transform(df);
    showNSchema(rtdAllWords.select("id", "article", "words")
                    .withColumn("tokens", callUDF("countTokens", col("words"))),
            "rtdAllWords: word count of the revisions", 0, false);

    // Tokenize 2: extract the [[...]] links ("blue words").
    // \x5b = '[' and \x5d = ']', so the pattern matches [[words and spaces]]
    String pattern1 = "\\x5b\\x5b[\\w\\s]*\\x5d\\x5d";
    RegexTokenizer rtGetBlueWords = new RegexTokenizer()
            .setInputCol("article")
            .setOutputCol("Blue Words")
            .setGaps(false) // match the pattern itself instead of splitting on it
            .setPattern(pattern1)
            .setToLowercase(false);
    Dataset<Row> rtdBlueWords = rtGetBlueWords.transform(df);
    showNSchema(rtdBlueWords, "rtdBlueWords: list", 300, false);

    // Count the [[...]] matches per row
    Dataset<Row> rtdBlueWordsCount = rtdBlueWords.select(
            rtdBlueWords.col("title"),
            rtdBlueWords.col("id"),
            rtdBlueWords.col("date"),
            rtdBlueWords.col("Blue Words"),
            org.apache.spark.sql.functions.size(col("Blue Words")).as("noBW"));
    showNSchema(rtdBlueWordsCount, "rtdBlueWordsCount: list with count", 100, false);

    // Maybe important: first eliminate the rows whose Blue-Words list is empty
    Dataset<Row> rtdBlueWordsCountNoEmpty = rtdBlueWordsCount.where(col("noBW").geq(1));
    showNSchema(rtdBlueWordsCountNoEmpty,
            "rtdBlueWordsExploded: not exploded and not truncated WITHOUT empty lines",
            100, false);
}
// Prints a separator, the row count, the rows themselves, and the schema
private static void showNSchema(Dataset<Row> set, String name, int lines, boolean truncate) {
    int count = (int) set.count();
    System.out.println("*******************************************************************************************");
    System.out.println(name);
    System.out.println("Row count: " + count);
    if (lines == 0) {
        set.show(truncate);
    } else {
        set.show(lines, truncate);
    }
    set.printSchema();
    System.out.println("*******************************************************************************************");
}

private static Dataset<Row> readJson(String path, SQLContext context) {
    return context.read().json(path).toDF();
}

private static void printSparkBegin() {
    System.out.println("\nSpark commenced\n");
}
// Strips the surrounding [[ ]] from each entry. Entries that do not match
// (or are shorter than 4 characters) leave null slots in the result array.
private static String[] truncate(String[] stringArr) {
    int i = 0;
    String[] retArr = new String[stringArr.length];
    for (String str : stringArr) {
        if (str.length() >= 4 && str.startsWith("[[") && str.endsWith("]]")) {
            retArr[i++] = str.substring(2, str.length() - 2);
        }
    }
    return retArr;
}
Running it produces the following exception (thrown from the count() call inside showNSchema):
19/06/29 22:30:20 ERROR Executor: Exception in task 0.0 in stage 12.0 (TID 28)
org.apache.spark.SparkException: Failed to execute user defined function(RegexTokenizer$$Lambda$1974/499802818: (string) => array<string>)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$2.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at java.util.regex.Matcher.getTextLength(Matcher.java:1283)
at java.util.regex.Matcher.reset(Matcher.java:309)
at java.util.regex.Matcher.<init>(Matcher.java:229)
at java.util.regex.Pattern.matcher(Pattern.java:1093)
at scala.util.matching.Regex$MatchIterator.<init>(Regex.scala:813)
at scala.util.matching.Regex.findAllIn(Regex.scala:395)
at org.apache.spark.ml.feature.RegexTokenizer.$anonfun$createTransformFunc$2(Tokenizer.scala:144)
... 15 more
19/06/29 22:30:20 ERROR TaskSetManager: Task 0 in stage 12.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 12.0 failed 1 times, most recent failure: Lost task 0.0 in stage 12.0 (TID 28, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function(RegexTokenizer$$Lambda$1974/499802818: (string) => array<string>)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$2.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at java.util.regex.Matcher.getTextLength(Matcher.java:1283)
at java.util.regex.Matcher.reset(Matcher.java:309)
at java.util.regex.Matcher.<init>(Matcher.java:229)
at java.util.regex.Pattern.matcher(Pattern.java:1093)
at scala.util.matching.Regex$MatchIterator.<init>(Regex.scala:813)
at scala.util.matching.Regex.findAllIn(Regex.scala:395)
at org.apache.spark.ml.feature.RegexTokenizer.$anonfun$createTransformFunc$2(Tokenizer.scala:144)
... 15 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:274)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
at org.apache.spark.sql.Dataset.$anonfun$count$1(Dataset.scala:2830)
at org.apache.spark.sql.Dataset.$anonfun$count$1$adapted(Dataset.scala:2829)
at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3364)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
at org.apache.spark.sql.Dataset.count(Dataset.scala:2829)
at experiment.Main.showNSchema(Main.java:232)
at experiment.Main.main(Main.java:110)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(RegexTokenizer$$Lambda$1974/499802818: (string) => array<string>)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$2.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at java.util.regex.Matcher.getTextLength(Matcher.java:1283)
at java.util.regex.Matcher.reset(Matcher.java:309)
at java.util.regex.Matcher.<init>(Matcher.java:229)
at java.util.regex.Pattern.matcher(Pattern.java:1093)
at scala.util.matching.Regex$MatchIterator.<init>(Regex.scala:813)
at scala.util.matching.Regex.findAllIn(Regex.scala:395)
at org.apache.spark.ml.feature.RegexTokenizer.$anonfun$createTransformFunc$2(Tokenizer.scala:144)
... 15 more
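As far as I can tell from the trace, the NullPointerException comes from RegexTokenizer calling Regex.findAllIn on the article value, which would happen if some rows end up with a null article (to my understanding, Spark does not enforce the nullable = false flags of a user-supplied schema when reading CSV). A diagnostic I could run to confirm this, sketched against the df defined above (the variable names here are mine):

// How many rows actually have a null article?
long nullArticles = df.filter(col("article").isNull()).count();
System.out.println("rows with null article: " + nullArticles);

// If that is nonzero, dropping those rows before tokenizing should avoid the NPE:
Dataset<Row> dfNoNullArticle = df.na().drop(new String[] { "article" });

Is that the right way to deal with null articles here, or am I misreading the trace?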