I have a simple PySpark pandas UDF. It works fine when run over a short period, but it fails when run over a long period, complaining that it cannot convert 781127.0 to double. The number comes from MARKET_VALUE_LOCAL, which is a double in the input. I don't understand what is happening. Can anybody help? Thanks.
dat = dat.join(ids_df, on=['symbol']). \
filter("date>=start_date and date <=last_date and date<=add_months(maturity_date, -6) "). \
drop("sid", "start_date", "last_date","maturity_date")
logging.info("Beginning clearning data...")
schema = """
date date, symbol string, TOTAL_RETURN_MTD_LOCAL double, EXCESS_RETURN_MTD double,
TOTAL_RETURN_DAILY double, EXCESS_RETURN_DAILY double, TOTAL_RETURN double, price double,
cpn_amount double, OAS double, OAD double, OASD double, CLASS_2 string, CLASS_3 string,
CLASS_4 string, INDEX_RATING string, YIELD_TO_WORST double, YIELD_TO_MAT double,
MARKET_VALUE_LOCAL double, TICKER string, AMOUNT_OUTSTANDING double
"""
prepare = pandas_udf(schema, PandasUDFType.GROUPED_MAP)(prepare_data)
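For context, this is roughly how the UDF gets applied. The real body of prepare_data is not shown here, and the grouping key is assumed; the explicit numeric cast at the end is only my guess at a workaround, since the schema declares MARKET_VALUE_LOCAL as double but the error suggests the returned pandas column holds strings like "781127.0":

import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Simplified sketch of prepare_data -- the actual cleaning logic is omitted.
# The schema declares MARKET_VALUE_LOCAL (and other numeric columns) as double,
# so the returned pandas DataFrame must hold numeric values there, not str.
def prepare_data(pdf):
    # ... actual cleaning steps ...
    # Guess at a workaround: force the numeric columns to float so Arrow
    # never sees Python str objects when casting to double.
    for col in ["MARKET_VALUE_LOCAL", "AMOUNT_OUTSTANDING", "price"]:
        pdf[col] = pd.to_numeric(pdf[col], errors="coerce")
    return pdf

prepare = pandas_udf(schema, PandasUDFType.GROUPED_MAP)(prepare_data)
dat = dat.groupby("symbol").apply(prepare)  # grouping key assumed, not shown in my original code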
pyarrow.lib.ArrowInvalid: Could not convert 781127.0 with type str: tried to convert to double
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:172)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage6.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)