嵌套对象上的Scala Spark数据集错误

时间:2019-04-20 02:30:51

标签: scala apache-spark apache-spark-sql

我正在尝试将具有强类型嵌套案例类的dataframe(dataset)代码测试到dataframe中,然后传递给我的函数。数据帧的序列化/创建一直失败,而且我没有足够的经验来了解scala或spark中发生的事情。

我认为我正在尝试确定一个模式,而spark也会确定一个模式,因为那些不匹配错误?

型号:


package io.swagger.client.model

import java.sql.Date
import scala.Enumeration

case class Member (
  memberId: String,
  memberIdSuffix: String,
  memberSubscriberId: String,
  memberEmpi: Option[Long] = None,
  memberFirstName: String,
  memberLastName: String,
  memberMiddleInitial: Option[String] = None,
  memberGender: String,
  memberBirthDate: Date,
  memberSocialSecurityNumber: Option[String] = None,
  memeberPhoneNumbers: List[Telecom],
  memberEmailAddresses: Option[List[Email]] = None,
  memberAddresses: List[Address],
  memberEligibilities: List[MemberEligibility]
)
case class Email (
  address: String,
  effectiveDate: Option[Date] = None,
  terminationDate: Option[Date] = None,
  isCurrent: Option[Boolean] = None,
  isActive: Option[Boolean] = None
)
case class Address (
  lineOne: String,
  lineTwo: String,
  cityName: String,
  stateCode: String,
  zipCode: String,
  effectiveDate: Option[Date] = None,
  terminationDate: Option[Date] = None,
  isCurrent: Option[Boolean] = None,
  isActive: Option[Boolean] = None
)
case class MemberEligibility (
  productId: String,
  productCategoryCode: String,
  classId: String,
  planId: String,
  groupId: String,
  maxCopayAmount: Option[Float] = None,
  voidIndicator: Boolean,
  healthplanEntryDate: Date,
  memberStatusDescription: Option[String] = None,
  eligibilityExplanation: Option[String] = None,
  eligibilitySelectionLevelDescription: Option[String] = None,
  eligibilityReason: Option[String] = None,
  effectiveDate: Option[Date] = None,
  terminationDate: Option[Date] = None,
  isCurrent: Option[Boolean] = None,
  isActive: Option[Boolean] = None
)
case class Telecom (
  phoneNumber: String,
  effectiveDate: Option[Date] = None,
  terminationDate: Option[Date] = None,
  isCurrent: Option[Boolean] = None,
  isActive: Option[Boolean] = None,
  telecomType: String
)


object Genders extends Enumeration {
    val male, female, unknown, other = Value
}
object Gender extends Enumeration  {
    val home, work, fax = Value 
}

测试代码:



import scala.util.{Try, Success, Failure}
import io.swagger.client.model._
import org.apache.spark.sql.{SparkSession, DataFrame, Dataset}
import org.apache.spark.SparkContext
import org.scalatest._
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession

trait SparkContextSetup {
  def withSparkContext(testMethod: (SparkSession, SparkContext) => Any) {

    val spark = org.apache.spark.sql.SparkSession.builder
            .master("local")
            .appName("Spark test")
            .getOrCreate()
    val sparkContext = spark.sparkContext
    try {
      testMethod(spark,sparkContext)
    } finally sparkContext.stop()
  }
}


class HelloSpec extends WordSpec with Matchers with SparkContextSetup {

  "My analytics" should {
    "calculate the right thing" in withSparkContext { (spark, sparkContext) =>
      MockMemberData(spark)
    }
  }

  private def MockMemberData(spark: SparkSession) = {
    import spark.implicits._
    import java.sql.{Date}
    import java.text.SimpleDateFormat
    import org.apache.spark.sql.types._

    var testDate = Try(new SimpleDateFormat("dd/MM/yyyy").parse("01/01/2018"))
      .map(d => new java.sql.Date(d.getTime()))
      .get

    val mockData = spark.sparkContext
      .parallelize(
        Seq(
          Member(
            memberId = "12345",
            memberIdSuffix = "Mr.",
            memberSubscriberId = "000000011",
            memberEmpi = None,
            memberFirstName = "firstname",
            memberLastName = "lastname",
            Some("w"),
            Genders.male.toString,
            testDate,
            Some("123456789"),
            List(
              Telecom("12345678910", None, None, Some(true), Some(true), "")
            ),
            Option(
              List(
                Email(
                  "test@gmail.com",
                  None,
                  Some(testDate),
                  isCurrent = Some(true),
                  isActive = Some(true)
                )
              )
            ),
            List(
              Address(
                "10 Awesome Dr",
                "",
                "St. Louis",
                "MO",
                "63000",
                None,
                None,
                None,
                None
              )
            ),
            List(
              MemberEligibility(
                "productid",
                "productCategoryCode",
                "classId",
                "planId",
                "groupId",
                None,
                false,
                testDate,
                None,
                None,
                None,
                None,
                None,
                None,
                None
              )
            )
          )
        )
      )
      .toDF()
    mockData.show()
  }
}

我希望收到一个数据框的架构(或本例中的数据集,我收到的是:

[info] HelloSpec:
[info] My analytics
[info] - should calculate the right thing *** FAILED ***
[info]   org.apache.spark.sql.AnalysisException: cannot resolve 'wrapoption(staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(MapObjects_loopValue10, MapObjects_loopIsNull11, StructField(address,StringType,true), StructField(effectiveDate,DateType,true), StructField(terminationDate,DateType,true), StructField(isCurrent,BooleanType,true), StructField(isActive,BooleanType,true), if (isnull(lambdavariable(MapObjects_loopValue10, MapObjects_loopIsNull11, StructField(address,StringType,true), StructField(effectiveDate,DateType,true), StructField(terminationDate,DateType,true), StructField(isCurrent,BooleanType,true), StructField(isActive,BooleanType,true)))) null else newInstance(class io.swagger.client.model.Email), cast(memberEmailAddresses as array<struct<address:string,effectiveDate:date,terminationDate:date,isCurrent:boolean,isActive:boolean>>)).array, true), ObjectType(class scala.collection.immutable.List))' due to data type mismatch: argument 1 requires scala.collection.immutable.List type, however, 'staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(MapObjects_loopValue10, MapObjects_loopIsNull11, StructField(address,StringType,true), StructField(effectiveDate,DateType,true), StructField(terminationDate,DateType,true), StructField(isCurrent,BooleanType,true), StructField(isActive,BooleanType,true), if (isnull(lambdavariable(MapObjects_loopValue10, MapObjects_loopIsNull11, StructField(address,StringType,true), StructField(effectiveDate,DateType,true), StructField(terminationDate,DateType,true), StructField(isCurrent,BooleanType,true), StructField(isActive,BooleanType,true)))) null else newInstance(class io.swagger.client.model.Email), cast(memberEmailAddresses as array<struct<address:string,effectiveDate:date,terminationDate:date,isCurrent:boolean,isActive:boolean>>)).array, true)' is of scala.collection.Seq type.;
[info]   at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
[info]   at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:82)
[info]   at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:74)
[info]   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:310)
[info]   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:310)
[info]   at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
[info]   at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:309)
[info]   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:307)
[info]   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:307)
[info]   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5$$anonfun$apply$11.apply(TreeNode.scala:360)
[info]   ...

更新

所以而不是

val mockData = spark.sparkContext
      .parallelize(
        Seq(

val mockData = spark.sparkContext
      .parallelize(
        List(

使用数组有效吗?

val mockData = spark.sparkContext
      .parallelize(
        Array(

为什么数组起作用,但Seq和List不起作用?

0 个答案:

没有答案