Java中的SparkSQL中的“无法访问可序列化”错误

时间:2018-07-05 08:35:11

标签: java apache-spark serialization apache-spark-sql

我试图用Java执行一段简单的spark-sql代码,但是出现了“无法访问可序列化”错误,请问有什么提示或建议吗? 我想我已经导入了所有必需的软件包,是否还缺少什么? 有时我还会收到“无法访问可克隆”错误。请告诉我如何解决该问题。

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.sql.*;

import static org.apache.spark.sql.functions.avg;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.max;

/**
 * Demonstrates a sequence of Spark SQL DataFrame operations on a Stack Overflow
 * survey CSV file, printing every intermediate result to standard output.
 *
 * <p>The steps are: load the CSV with a header row, show the raw data and a few
 * projections/filters, cast the two midpoint columns to integers, run ordering and
 * aggregation queries on the casted data, and finally group rows into salary buckets.
 */
public class StackOverFlowSurvey {

    private static final String AGE_MIDPOINT = "age_midpoint";
    private static final String SALARY_MIDPOINT = "salary_midpoint";
    private static final String SALARY_MIDPOINT_BUCKET = "salary_midpoint_bucket";

    /** Path of the survey CSV handed to the DataFrame reader. */
    private static final String INPUT_PATH = "in/2016-stack-overflow-survey-responses.csv";

    /** Width of one salary bucket; salaries are rounded down to a multiple of this value. */
    private static final int SALARY_BUCKET_SIZE = 20000;

    /**
     * Entry point: creates a local single-threaded Spark session, runs all demo queries
     * and always stops the session afterwards.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if any Spark operation fails
     */
    public static void main(String[] args) throws Exception {

        Logger.getLogger("org").setLevel(Level.ERROR);
        SparkSession session = SparkSession.builder()
                .appName("StackOverFlowSurvey")
                .master("local[1]")
                .getOrCreate();

        // Stop the session even when a query fails (e.g. missing file or column),
        // so the local Spark context is not left running.
        try {
            Dataset<Row> responses = session.read().option("header", "true").csv(INPUT_PATH);

            printRawResponses(responses);

            Dataset<Row> castedResponse = castMidpointsToInteger(responses);
            printCastedResponses(castedResponse);
            printSalaryBuckets(castedResponse);
        } finally {
            session.stop();
        }
    }

    /** Prints the schema, a sample, a projection, a filter and a group count of the raw data. */
    private static void printRawResponses(Dataset<Row> responses) {
        System.out.println("=== Print out schema ===");
        responses.printSchema();

        System.out.println("=== Print 20 records of responses table ===");
        responses.show(20);

        System.out.println("=== Print the so_region and self_identification columns of gender table ===");
        responses.select(col("so_region"), col("self_identification")).show();

        System.out.println("=== Print records where the response is from Afghanistan ===");
        responses.filter(col("country").equalTo("Afghanistan")).show();

        System.out.println("=== Print the count of occupations ===");
        responses.groupBy(col("occupation")).count().show();
    }

    /** Returns a copy of {@code responses} with the salary and age midpoint columns cast to integer. */
    private static Dataset<Row> castMidpointsToInteger(Dataset<Row> responses) {
        System.out.println("=== Cast the salary mid point and age mid point to integer ===");
        return responses
                .withColumn(SALARY_MIDPOINT, col(SALARY_MIDPOINT).cast("integer"))
                .withColumn(AGE_MIDPOINT, col(AGE_MIDPOINT).cast("integer"));
    }

    /** Prints the casted schema plus the filter, ordering and per-country aggregation queries. */
    private static void printCastedResponses(Dataset<Row> castedResponse) {
        System.out.println("=== Print out casted schema ===");
        castedResponse.printSchema();

        System.out.println("=== Print records with average mid age less than 20 ===");
        // Column.lt is the Java-facing name for Scala's `<` operator; calling the
        // compiler-mangled `$less` works but is not meant to be used from Java.
        castedResponse.filter(col(AGE_MIDPOINT).lt(20)).show();

        System.out.println("=== Print the result by salary middle point in descending order ===");
        castedResponse.orderBy(col(SALARY_MIDPOINT).desc()).show();

        System.out.println("=== Group by country and aggregate by average salary middle point and max age middle point ===");
        castedResponse.groupBy("country").agg(avg(SALARY_MIDPOINT), max(AGE_MIDPOINT)).show();
    }

    /** Adds a salary bucket column and prints it next to the salary, then the row count per bucket. */
    private static void printSalaryBuckets(Dataset<Row> castedResponse) {
        // divide -> cast to integer -> multiply rounds each salary down to its bucket's lower bound.
        Dataset<Row> responseWithSalaryBucket = castedResponse.withColumn(
                SALARY_MIDPOINT_BUCKET,
                col(SALARY_MIDPOINT).divide(SALARY_BUCKET_SIZE).cast("integer").multiply(SALARY_BUCKET_SIZE));

        System.out.println("=== With salary bucket column ===");
        responseWithSalaryBucket.select(col(SALARY_MIDPOINT), col(SALARY_MIDPOINT_BUCKET)).show();

        System.out.println("=== Group by salary bucket ===");
        responseWithSalaryBucket.groupBy(SALARY_MIDPOINT_BUCKET).count().orderBy(col(SALARY_MIDPOINT_BUCKET)).show();
    }
}

0 个答案:

没有答案