AWS Glue 将数据帧(DataFrame)转换为动态帧(DynamicFrame)时出错

时间:2019-06-16 21:25:35

标签: python pyspark apache-spark-sql pyspark-sql aws-glue

这是我的代码,我试图从其他2个数据帧的左联接结果集中创建一个新数据帧,然后尝试将其转换为动态帧。

# Read existing hash keys from Snowflake via the Spark-Snowflake connector.
# NOTE(review): sqlContext, SNOWFLAKE_SOURCE_NAME and sfOptions are defined
# elsewhere (Glue job setup) — not visible here.
dfs = sqlContext.read.format(SNOWFLAKE_SOURCE_NAME).options(**sfOptions).option("query", "SELECT hashkey as hash From randomtable").load()

# Source: load the full table from the Glue Data Catalog as a DynamicFrame.
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "test", table_name = "randomtable", transformation_ctx = "datasource0")

# Add a hash value: convert to a Spark DataFrame and derive a SHA-256 hash
# over all columns concatenated with "||" as the separator.
df = datasource0.toDF()
df.cache()
df = df.withColumn("hashkey", sha2(concat_ws("||", *df.columns), 256))

# Drop duplicate rows that share the same hashkey.
df1 = df.dropDuplicates(subset=['hashkey'])

# Incremental rows = source rows whose hashkey has no match in Snowflake
# (left join, keep rows where the right-side 'hash' is null — an anti-join).
inc = df1.join(dfs, df1["hashkey"] == dfs["hash"], how='left').filter(col('hash').isNull())

# Convert the result back to a Glue DynamicFrame.
# NOTE(review): this is the call that raises NoSuchMethodError in the quoted
# traceback — presumably a Spark/Snowflake-connector version mismatch in the
# connector's pushdown code path, not a logic error here; verify connector
# version or disable autopushdown.
datasource1 = DynamicFrame.fromDF(inc, glueContext, "datasource1")

这是我尝试将数据帧转换为动态帧时遇到的错误。

  

datasource1 = DynamicFrame.fromDF(inc, glueContext, "datasource1")   文件   "/mnt/yarn/usercache/root/appcache/application_1560272525947_0002/container_1560272525947_0002_01_000001/PyGlue.zip/awsglue/dynamicframe.py",

     

第150行,来自DF       文件“ /mnt/yarn/usercache/root/appcache/application_1560272525947_0002/container_1560272525947_0002_01_000001/py4j-0.10.4-src.zip/py4j/java_gateway.py”,   第1133行,在致电       文件“ /mnt/yarn/usercache/root/appcache/application_1560272525947_0002/container_1560272525947_0002_01_000001/pyspark.zip/pyspark/sql/utils.py”,   第63行,在装饰中       文件“ /mnt/yarn/usercache/root/appcache/application_1560272525947_0002/container_1560272525947_0002_01_000001/py4j-0.10.4-src.zip/py4j/protocol.py”,   第319行,位于get_return_value中       py4j.protocol.Py4JJavaError:调用z:com.amazonaws.services.glue.DynamicFrame.apply时发生错误。       :java.lang.NoSuchMethodError:org.apache.spark.sql.catalyst.expressions.AttributeReference。(Ljava / lang / String; Lorg / apache / spark / sql / types / DataType; ZLorg / apache / spark / sql / types /元数据; Lorg / apache / spark / sql / catalyst / expressions / ExprId; Lscala / collection / Seq;)V       在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryHelper $$ anonfun $ 8.apply(QueryHelper.scala:66)       在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryHelper $$ anonfun $ 8.apply(QueryHelper.scala:65)       在scala.collection.TraversableLike $$ anonfun $ map $ 1.apply(TraversableLike.scala:234)       在scala.collection.TraversableLike $$ anonfun $ map $ 1.apply(TraversableLike.scala:234)       在scala.collection.immutable.List.foreach(List.scala:381)       在scala.collection.TraversableLike $ class.map(TraversableLike.scala:234)       在scala.collection.immutable.List.map(List.scala:285)       在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryHelper。(QueryHelper.scala:64)       在net.snowflake.spark.snowflake.pushdowns.querygeneration.SourceQuery。(SnowflakeQuery.scala:100)       在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.net $ snowflake $ spark $ snowflake $ pushdowns $ querygeneration $ QueryBuilder $$ generateQueries(QueryBuilder.scala:98)       
在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.liftedTree1 $ 1(QueryBuilder.scala:63)       在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.treeRoot $ lzycompute(QueryBuilder.scala:61)       在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.treeRoot(QueryBuilder.scala:60)       在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.tryBuild $ lzycompute(QueryBuilder.scala:34)处       在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder.tryBuild(QueryBuilder.scala:33)       在net.snowflake.spark.snowflake.pushdowns.querygeneration.QueryBuilder $ .getRDDFromPlan(QueryBuilder.scala:179)处       在net.snowflake.spark.snowflake.pushdowns.SnowflakeStrategy.buildQueryRDD(SnowflakeStrategy.scala:42)       在net.snowflake.spark.snowflake.pushdowns.SnowflakeStrategy.apply(SnowflakeStrategy.scala:24)       在org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 1.apply(QueryPlanner.scala:62)       在org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 1.apply(QueryPlanner.scala:62)       在scala.collection.Iterator $$ anon $ 12.nextCur(Iterator.scala:434)       在scala.collection.Iterator $$ anon $ 12.hasNext(Iterator.scala:440)       在scala.collection.Iterator $$ anon $ 12.hasNext(Iterator.scala:439)       在org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92)       在org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2 $$ anonfun $ apply $ 2.apply(QueryPlanner.scala:77)       位于org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2 $$ anonfun $ apply $ 2.apply(QueryPlanner.scala:74)       在scala.collection.TraversableOnce $$ anonfun $ foldLeft $ 1.apply(TraversableOnce.scala:157)       在scala.collection.TraversableOnce $$ anonfun $ foldLeft $ 1.apply(TraversableOnce.scala:157)       在scala.collection.Iterator $ class.foreach(Iterator.scala:893)       在scala.collection.AbstractIterator.foreach(Iterator.scala:1336)   
    在scala.collection.TraversableOnce $ class.foldLeft(TraversableOnce.scala:157)       在scala.collection.AbstractIterator.foldLeft(Iterator.scala:1336)       在org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2.apply(QueryPlanner.scala:74)       在org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2.apply(QueryPlanner.scala:66)       在scala.collection.Iterator $$ anon $ 12.nextCur(Iterator.scala:434)       在scala.collection.Iterator $$ anon $ 12.hasNext(Iterator.scala:440)       在org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92)       在org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2 $$ anonfun $ apply $ 2.apply(QueryPlanner.scala:77)       位于org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2 $$ anonfun $ apply $ 2.apply(QueryPlanner.scala:74)       在scala.collection.TraversableOnce $$ anonfun $ foldLeft $ 1.apply(TraversableOnce.scala:157)       在scala.collection.TraversableOnce $$ anonfun $ foldLeft $ 1.apply(TraversableOnce.scala:157)       在scala.collection.Iterator $ class.foreach(Iterator.scala:893)       在scala.collection.AbstractIterator.foreach(Iterator.scala:1336)       在scala.collection.TraversableOnce $ class.foldLeft(TraversableOnce.scala:157)       在scala.collection.AbstractIterator.foldLeft(Iterator.scala:1336)       在org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2.apply(QueryPlanner.scala:74)       在org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2.apply(QueryPlanner.scala:66)       在scala.collection.Iterator $$ anon $ 12.nextCur(Iterator.scala:434)       在scala.collection.Iterator $$ anon $ 12.hasNext(Iterator.scala:440)       在org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92)       在org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2 $$ anonfun $ apply $ 2.apply(QueryPlanner.scala:77)       位于org.apache.spark.sql.catalyst.planning.QueryPlanner $$ anonfun $ 2 $$ anonfun $ apply 
$ 2.apply(QueryPlanner.scala:74)       在scala.collection.TraversableOnce $$ anonfun $ foldLeft $ 1.apply(TraversableOnce.scala:157)

非常感谢您的帮助。

0 个答案:

没有答案