NullPointerException when applying a RegexTokenizer to certain datasets

Posted: 2019-06-29 22:04:39

Tags: java regex apache-spark

I have the following code. I am trying to load some data from a Wikipedia database dump, stored in a tab-delimited CSV file. The CSV file contains some metadata plus the full article text. I want to extract the words that appear in the format [[some word]], but it does not seem to work: as soon as I call count() or printSchema(), I get an error that I do not understand.

public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("HelloWorld").master("local")
                .getOrCreate();

    spark.sparkContext().setLogLevel("ERROR");

    printSparkBegin();

    SQLContext context = new org.apache.spark.sql.SQLContext(spark);

    // Read CSV

    StructType schema = new StructType(new StructField[] {
        new StructField("title", DataTypes.StringType, false, Metadata.empty()),
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("date", DataTypes.StringType, false, Metadata.empty()),
        new StructField("article", DataTypes.StringType, false, Metadata.empty()),
    });

    // raw CSV DataFrame
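    // Note: with mode DROPMALFORMED, lines that do not parse against the schema are silently dropped.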
    Dataset<Row> df = context.read()
        .format("com.databricks.spark.csv")
        .schema(schema)
        .option("header", "false")
        .option("delimiter", "\t")
        .option("mode", "DROPMALFORMED")
        .load(
            pathToFolder + "wiki_9.csv"
            // "wiki_9_1.csv"
        );

    showNSchema(df, "df: csv pur", 0, true);

    // Tokenize 1
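    // With the default gaps = true, "\\W" is treated as the delimiter pattern:
    // the article text is split into tokens at every non-word character.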
    RegexTokenizer rtGetWords = new RegexTokenizer()
        .setInputCol("article")
        .setOutputCol("words")
        .setPattern("\\W");

    spark.udf().register("countTokens", (WrappedArray<?> words) -> words.size(), DataTypes.IntegerType);
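    // countTokens simply returns the length of the tokens array produced for each row.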

    Dataset<Row> rtdAllWords = rtGetWords.transform(df);

    showNSchema(rtdAllWords.select("id", "article", "words")
        .withColumn("tokens", callUDF("countTokens", col("words"))),
        "rtdAllWords: Anzahl Wörter der Revisionen", 0, false);

    // Tokenize 2
    String pattern1 = "\\x5b\\x5b[\\w\\s]*\\x5d\\x5d";
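    // \x5b is '[' and \x5d is ']', so pattern1 matches literal "[[...]]" spans made up of
    // word characters and whitespace; with gaps = false below, the matches themselves
    // become the tokens instead of being used as split points.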

    RegexTokenizer rtGetBlueWords = new RegexTokenizer()
        .setInputCol("article")
        .setOutputCol("Blue Words")
        .setGaps(false)
        .setPattern(pattern1)
        .setToLowercase(false);

    Dataset<Row> rtdBlueWords = rtGetBlueWords.transform(df);
    showNSchema(rtdBlueWords, "rtdBlueWords: list", 300, false);

    // Blue Words explode and remove [[...]]
    Dataset<Row> rtdBlueWordsCount = rtdBlueWords.select(
            rtdBlueWords.col("title"),
            rtdBlueWords.col("id"),
            rtdBlueWords.col("date"),
            rtdBlueWords.col("Blue Words"),
            org.apache.spark.sql.functions.size(col("Blue Words")).as("noBW"));

    showNSchema(rtdBlueWordsCount,
            "rtdBlueWordsCount: List with Count",
            100, false);
    // POSSIBLY IMPORTANT: first eliminate the rows whose Blue-Words list is empty

    Dataset<Row> rtdBlueWordsCountNoEmpty = 
            rtdBlueWordsCount
            .where(col("noBW").geq(1));

    showNSchema(rtdBlueWordsCountNoEmpty,
            "rtdBlueWordsExploded: not exploded and not truncated WITHOUT empty lines", 100, false);
}


private static void showNSchema(Dataset<Row> set, String name, int lines,
            boolean truncate) {
    int count = (int) set.count();
    System.out.println(
                "*******************************************************************************************");
    System.out.println(name);
    System.out.println("Anzahl Zeilen: "+ count);
    if (lines == 0) {
        set.show(truncate);
    } else {
        set.show(lines, truncate);
    }
    set.printSchema();
    System.out.println(
                "*******************************************************************************************");
}

private static Dataset<Row> readJson(String path, SQLContext context) {
    Dataset<Row> set = context.read().json(path).toDF();
    return set;
}

private static void printSparkBegin() {
    System.out.println("\n" + "Spark commenced" + "\n");
}

private static String[] truncate(String[] stringArr) {
    int i = 0;
    String[] retArr = new String[stringArr.length];

    for (String str : stringArr) {
        if (str.charAt(0) == '['
            && str.charAt(1) == '['
            && str.charAt(str.length() - 1) == ']'
            && str.charAt(str.length() - 2) == ']') {
                retArr[i++] = str.substring(2, str.length() - 2);
        }
    }
    return retArr;
}

This results in the following exception:


19/06/29 22:30:20 ERROR Executor: Exception in task 0.0 in stage 12.0 (TID 28)
org.apache.spark.SparkException: Failed to execute user defined function(RegexTokenizer$$Lambda$1974/499802818: (string) => array<string>)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$2.hasNext(WholeStageCodegenExec.scala:636)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
    at java.util.regex.Matcher.getTextLength(Matcher.java:1283)
    at java.util.regex.Matcher.reset(Matcher.java:309)
    at java.util.regex.Matcher.<init>(Matcher.java:229)
    at java.util.regex.Pattern.matcher(Pattern.java:1093)
    at scala.util.matching.Regex$MatchIterator.<init>(Regex.scala:813)
    at scala.util.matching.Regex.findAllIn(Regex.scala:395)
    at org.apache.spark.ml.feature.RegexTokenizer.$anonfun$createTransformFunc$2(Tokenizer.scala:144)
    ... 15 more
19/06/29 22:30:20 ERROR TaskSetManager: Task 0 in stage 12.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 12.0 failed 1 times, most recent failure: Lost task 0.0 in stage 12.0 (TID 28, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function(RegexTokenizer$$Lambda$1974/499802818: (string) => array<string>)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$2.hasNext(WholeStageCodegenExec.scala:636)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
    at java.util.regex.Matcher.getTextLength(Matcher.java:1283)
    at java.util.regex.Matcher.reset(Matcher.java:309)
    at java.util.regex.Matcher.<init>(Matcher.java:229)
    at java.util.regex.Pattern.matcher(Pattern.java:1093)
    at scala.util.matching.Regex$MatchIterator.<init>(Regex.scala:813)
    at scala.util.matching.Regex.findAllIn(Regex.scala:395)
    at org.apache.spark.ml.feature.RegexTokenizer.$anonfun$createTransformFunc$2(Tokenizer.scala:144)
    ... 15 more

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1889)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1877)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1876)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:926)
    at scala.Option.foreach(Option.scala:274)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
    at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:945)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
    at org.apache.spark.sql.Dataset.$anonfun$count$1(Dataset.scala:2830)
    at org.apache.spark.sql.Dataset.$anonfun$count$1$adapted(Dataset.scala:2829)
    at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3364)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:78)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
    at org.apache.spark.sql.Dataset.count(Dataset.scala:2829)
    at experiment.Main.showNSchema(Main.java:232)
    at experiment.Main.main(Main.java:110)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(RegexTokenizer$$Lambda$1974/499802818: (string) => array<string>)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$2.hasNext(WholeStageCodegenExec.scala:636)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
    at java.util.regex.Matcher.getTextLength(Matcher.java:1283)
    at java.util.regex.Matcher.reset(Matcher.java:309)
    at java.util.regex.Matcher.<init>(Matcher.java:229)
    at java.util.regex.Pattern.matcher(Pattern.java:1093)
    at scala.util.matching.Regex$MatchIterator.<init>(Regex.scala:813)
    at scala.util.matching.Regex.findAllIn(Regex.scala:395)
    at org.apache.spark.ml.feature.RegexTokenizer.$anonfun$createTransformFunc$2(Tokenizer.scala:144)
    ... 15 more
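
From the trace, the NullPointerException is raised inside Regex.findAllIn / Matcher, so my current suspicion is that some rows end up with a null (or empty) article column, perhaps because of malformed lines in the dump. If that is the cause, would filtering those rows out before running the tokenizers, roughly like the sketch below, be the right fix? (dfNonNull is just an illustrative name, and I have not verified this.)

    // Hypothetical guard (untested): keep only rows whose article is non-null and non-empty
    // before handing the DataFrame to the tokenizers.
    Dataset<Row> dfNonNull = df.filter(
            col("article").isNotNull()
            .and(org.apache.spark.sql.functions.length(col("article")).gt(0)));

    Dataset<Row> rtdBlueWords = rtGetBlueWords.transform(dfNonNull);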

0 Answers