import org.apache.spark.h2o._                      // H2OFrame and the org.apache.spark.h2o.RDD alias
import org.apache.spark.examples.h2o.Airlines      // Airlines product type from the Sparkling Water examples
val h2oContext = H2OContext.getOrCreate(sc)
import h2oContext._                                // brings asRDD into scope
import h2oContext.implicits._                      // implicit DataFrame -> H2OFrame conversion

val airlinesDf = spark.read.csv("input file")
val airlinesData: H2OFrame = airlinesDf
val airlinesTable: RDD[Airlines] = asRDD[Airlines](airlinesDf)
val flightsToORD = airlinesTable.filter(f => f.Dest == Some("ORD"))
flightsToORD.count()
Whenever we run any action on flightsToORD, we get the error message below. We can only create a Spark DataFrame from the input file here, which is why we first build the Spark DataFrame and then convert it to an H2OFrame.
I am using Sparkling Water 2.0.25 with Spark 2.0.2.
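For context, spark.read.csv with no options parses every column as string (the airlinesDf schema below shows ActualElapsedTime: string, AirTime: string, ...), while the Airlines class from the Sparkling Water examples declares numeric Option[Int] fields, which would explain the longAt call on a string chunk in the trace. As a minimal sketch, assuming the file has a header row, the same file could be read with typed columns instead (header and inferSchema are standard Spark CSV options):

val typedDf = spark.read
  .option("header", "true")       // first line holds the column names
  .option("inferSchema", "true")  // sample the file so numeric columns get numeric types
  .csv("input file")
typedDf.printSchema()             // columns should now be int/double where appropriate, not string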
airlinesDf: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [ActualElapsedTime: string, AirTime: string ... 29 more fields]
airlinesTable: org.apache.spark.h2o.RDD[org.apache.spark.examples.h2o.Airlines] = H2ORDD[1420] at RDD at H2ORDD.scala:52
flightsToORD: org.apache.spark.rdd.RDD[org.apache.spark.examples.h2o.Airlines] = MapPartitionsRDD[1421] at filter at <console>:412
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 253.0 failed 4 times, most recent failure: Lost task 0.3 in stage 253.0 (TID 2351, dn4.h2oitc.ad.phemi.com): java.lang.IllegalArgumentException: Operation not allowed on string vector.
at water.fvec.CStrChunk.at8_impl(CStrChunk.java:89)
at water.fvec.Chunk.at8(Chunk.java:266)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx.longAt(InternalReadConverterCtx.scala:60)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx.intAt(InternalReadConverterCtx.scala:59)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx.intAt(InternalReadConverterCtx.scala:29)
at org.apache.spark.h2o.converters.ReadConverterCtx$$anonfun$ExtractorsTable$4.apply(ReadConverterCtx.scala:108)
at org.apache.spark.h2o.converters.ReadConverterCtx$$anonfun$ExtractorsTable$4.apply(ReadConverterCtx.scala:108)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx$$anonfun$returnOption$2.apply(InternalReadConverterCtx.scala:47)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx$$anonfun$returnOption$2.apply(InternalReadConverterCtx.scala:46)
at scala.Option$WithFilter.flatMap(Option.scala:208)
at org.apache.spark.h2o.backends.internal.InternalReadConverterCtx.returnOption(InternalReadConverterCtx.scala:46)
at org.apache.spark.h2o.converters.ReadConverterCtx$$anonfun$org$apache$spark$h2o$converters$ReadConverterCtx$$OptionReadersMap$1$$anonfun$apply$1.apply(ReadConverterCtx.scala:120)
at org.apache.spark.h2o.converters.ReadConverterCtx$$anonfun$org$apache$spark$h2o$converters$ReadConverterCtx$$OptionReadersMap$1$$anonfun$apply$1.apply(ReadConverterCtx.scala:120)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator$$anonfun$5$$anonfun$apply$1.apply(H2ORDD.scala:129)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator$$anonfun$6.apply(H2ORDD.scala:133)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator$$anonfun$6.apply(H2ORDD.scala:133)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator.extractRow(H2ORDD.scala:133)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator.readOneRow(H2ORDD.scala:189)
at org.apache.spark.h2o.converters.H2ORDD$H2ORDDIterator.hasNext(H2ORDD.scala:157)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:461)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1763)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1134)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1134)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1899)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)