I need to create my own UnaryTransformer instance that accepts a DataFrame column of type Array[String] and outputs the same type. While attempting this on Spark 2.1.0, I run into a ClassCastException. I have put together a sample test that demonstrates my case.
import org.apache.spark.SparkConf
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

class MyTransformer(override val uid: String)
  extends UnaryTransformer[Array[String], Array[String], MyTransformer] {

  override protected def createTransformFunc: Array[String] => Array[String] = {
    param1 => {
      param1.foreach(println(_))
      param1
    }
  }

  override protected def outputDataType: DataType = ArrayType(StringType)

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == ArrayType(StringType),
      s"Data type mismatch between Array[String] and provided type $inputType.")
  }

  def this() = this(Identifiable.randomUID("tester"))
}

object Tester {

  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setAppName("Tester")
    implicit val sparkSession = SparkSession.builder().config(config).getOrCreate()
    import sparkSession.implicits._

    val dataframe = Seq(
      Array("Firstly", "F1"), Array("Driving", "S1"), Array("Ran", "T3"),
      Array("Fourth", "F4"), Array("Running", "F5"), Array("Gone", "S6")
    ).toDF("input")

    val transformer = new MyTransformer().setInputCol("input").setOutputCol("output")
    val transformed = transformer.transform(dataframe)
    transformed.select("output").show()

    println("Complete....")
    sparkSession.close()
  }
}
The stack trace is attached for reference:
Exception in thread "main" org.apache.spark.SparkException: Failed to execute user defined function($anonfun$createTransformFunc$1: (array<string>) => array<string>)
    at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1072)
    at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:144)
    at org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:48)
    at org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:30)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
    at scala.collection.immutable.List.map(List.scala:296)
    at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$21.applyOrElse(Optimizer.scala:1078)
    at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$21.applyOrElse(Optimizer.scala:1073)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:287)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
    at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
    at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
    at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:277)
    at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$.apply(Optimizer.scala:1073)
    at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$.apply(Optimizer.scala:1072)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
    at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
    at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
    at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
    at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:73)
    at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:73)
    at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:79)
    at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:75)
    at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:84)
    at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:84)
    at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2791)
    at org.apache.spark.sql.Dataset.head(Dataset.scala:2112)
    at org.apache.spark.sql.Dataset.take(Dataset.scala:2327)
    at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
    at org.apache.spark.sql.Dataset.show(Dataset.scala:636)
    at org.apache.spark.sql.Dataset.show(Dataset.scala:595)
    at org.apache.spark.sql.Dataset.show(Dataset.scala:604)
    at Tester$.main(Tester.scala:45)
    at Tester.main(Tester.scala)
Caused by: java.lang.ClassCastException: scala.collection.mutable.WrappedArray$ofRef cannot be cast to [Ljava.lang.String;
    at MyTransformer$$anonfun$createTransformFunc$1.apply(Tester.scala:9)
    at org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2.apply(ScalaUDF.scala:89)
    at org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2.apply(ScalaUDF.scala:88)
    at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1069)
    ... 53 more
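The "Caused by" line shows the actual mismatch: Spark SQL hands an ArrayType(StringType) column to a Scala UDF as a scala.collection.mutable.WrappedArray (i.e. a Seq[String]), not as a JVM Array[String], so the cast generated for the Array[String] parameter fails at runtime. Below is a minimal sketch of a workaround, assuming Spark 2.1.x and that typing the transformer over Seq[String] instead is acceptable; MySeqTransformer is a hypothetical name for the revised class:

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

// Hypothetical revision: typed over Seq[String], because Spark passes
// ArrayType columns into a UDF as WrappedArray, which is a Seq.
class MySeqTransformer(override val uid: String)
  extends UnaryTransformer[Seq[String], Seq[String], MySeqTransformer] {

  def this() = this(Identifiable.randomUID("tester"))

  override protected def createTransformFunc: Seq[String] => Seq[String] = {
    param1 => {
      param1.foreach(println(_))
      param1 // a Seq result still maps back to an ArrayType(StringType) column
    }
  }

  override protected def outputDataType: DataType = ArrayType(StringType)

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == ArrayType(StringType),
      s"Data type mismatch between ArrayType(StringType) and provided type $inputType.")
  }
}

With that one type change the Tester object should run unchanged: Seq(Array(...)).toDF("input") already produces an ArrayType(StringType) column, and the setInputCol/setOutputCol calls are unaffected.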