How do I run this function in a pandas_udf?

Asked: 2019-07-05 02:14:58

Tags: pandas pyspark user-defined-functions unzip azure-databricks

I have a custom Python function, AutoUnzip, that I want to run both as a pandas_udf and as a plain one-row-at-a-time UDF.

When I define the UDF as below and run it, it works:

AutoUnzip_udf = udf(lambda x: AutoUnzip(dirInput='/dbfs/mnt/zip', dirOutput='/dbfs/mnt/unzip', ZipFileName=x, ext='.html,.htm'), returnType=StringType())
df.select("file", AutoUnzip_udf("file")).show()

But when I switch to a pandas_udf, defined as follows, it fails.

@pandas_udf('string', functionType=PandasUDFType.SCALAR)
def AutoUnzip_udf(x):
    AutoUnzip(dirInput='/dbfs/mnt/zip', dirOutput='/dbfs/mnt/jba_b5/unzip', ZipFileName=x, ext='.html,.htm')

df.select("file", AutoUnzip_udf("file")).show()

The full traceback is:

Py4JJavaError                             Traceback (most recent call last)
<command-31569329126536> in <module>()
      6 
      7 
----> 8 (df.select("file", AutoUnzip_udf("file"))).show()

/databricks/spark/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
    377         """
    378         if isinstance(truncate, bool) and truncate:
--> 379             print(self._jdf.showString(n, 20, vertical))
    380         else:
    381             print(self._jdf.showString(n, int(truncate), vertical))

/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling o12758.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 160.0 failed 4 times, most recent failure: Lost task 0.3 in stage 160.0 (TID 581, 10.139.64.6, executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/worker.py", line 403, in main
    process()
  File "/databricks/spark/python/pyspark/worker.py", line 398, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/databricks/spark/python/pyspark/serializers.py", line 296, in dump_stream
    for series in iterator:
  File "<string>", line 1, in <lambda>
  File "/databricks/spark/python/pyspark/worker.py", line 101, in <lambda>
    return lambda *a: (verify_result_length(*a), arrow_return_type)
  File "/databricks/spark/python/pyspark/worker.py", line 92, in verify_result_length
    result = f(*a)
  File "/databricks/spark/python/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<command-31569329126536>", line 5, in AutoUnzip_udf
  File "<command-2540389590283565>", line 17, in AutoUnzip
  File "/usr/lib/python3.5/zipfile.py", line 1026, in __init__
    self._RealGetContents()
  File "/usr/lib/python3.5/zipfile.py", line 1089, in _RealGetContents
    endrec = _EndRecData(fp)
  File "/usr/lib/python3.5/zipfile.py", line 241, in _EndRecData
    fpin.seek(0, 2)
  File "/databricks/python/lib/python3.5/site-packages/pandas/core/generic.py", line 2744, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'seek'

    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:490)
    at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:172)
    at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:444)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:126)
    at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:124)
    at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
    at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:304)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:304)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.doRunTask(Task.scala:139)
    at org.apache.spark.scheduler.Task.run(Task.scala:112)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1481)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2355)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2343)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2342)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2342)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1096)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1096)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1096)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2574)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2510)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:893)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2240)
    at org.apache.spark.sql.execution.collect.Collector.runSparkJobs(Collector.scala:270)
    at org.apache.spark.sql.execution.collect.Collector.collect(Collector.scala:280)
    at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:80)
    at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:86)
    at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:508)
    at org.apache.spark.sql.execution.CollectLimitExec.executeCollectResult(limit.scala:55)
    at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectResult(Dataset.scala:2828)
    at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3440)
    at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2557)
    at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2557)
    at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:3424)
    at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:3419)
    at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:99)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:228)
    at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:85)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:158)
    at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3419)
    at org.apache.spark.sql.Dataset.head(Dataset.scala:2557)
    at org.apache.spark.sql.Dataset.take(Dataset.scala:2771)
    at org.apache.spark.sql.Dataset.getRows(Dataset.scala:265)
    at org.apache.spark.sql.Dataset.showString(Dataset.scala:302)
    at sun.reflect.GeneratedMethodAccessor551.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
    at py4j.Gateway.invoke(Gateway.java:295)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:251)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/worker.py", line 403, in main
    process()
  File "/databricks/spark/python/pyspark/worker.py", line 398, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/databricks/spark/python/pyspark/serializers.py", line 296, in dump_stream
    for series in iterator:
  File "<string>", line 1, in <lambda>
  File "/databricks/spark/python/pyspark/worker.py", line 101, in <lambda>
    return lambda *a: (verify_result_length(*a), arrow_return_type)
  File "/databricks/spark/python/pyspark/worker.py", line 92, in verify_result_length
    result = f(*a)
  File "/databricks/spark/python/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<command-31569329126536>", line 5, in AutoUnzip_udf
  File "<command-2540389590283565>", line 17, in AutoUnzip
  File "/usr/lib/python3.5/zipfile.py", line 1026, in __init__
    self._RealGetContents()
  File "/usr/lib/python3.5/zipfile.py", line 1089, in _RealGetContents
    endrec = _EndRecData(fp)
  File "/usr/lib/python3.5/zipfile.py", line 241, in _EndRecData
    fpin.seek(0, 2)
  File "/databricks/python/lib/python3.5/site-packages/pandas/core/generic.py", line 2744, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'seek'

    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:490)
    at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:172)
    at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
    at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:444)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:126)
    at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:124)
    at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
    at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:304)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:304)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.doRunTask(Task.scala:139)
    at org.apache.spark.scheduler.Task.run(Task.scala:112)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1481)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    ... 1 more

How can I fix this so that the pandas_udf runs?

0 Answers:

No answers yet.