我正在尝试将Pandas UDF函数导入到main.py中,但会引发此错误:
AttributeError:'NoneType'对象没有属性'_jvm'
这是3个python文件:test_base.py,test_pandasudf.py和main.py。
# test_base.py
class FP(self, start_date):
def __init__(self, start_date):
spark_conf = pyspark.SparkConf().setMaster("local[*]").setAppName("app").set('spark.jars.packages', 'org.apache.spark:spark-avro_2.11:2.4.5')
self.sc = pyspark.SparkContext.getOrCreate(conf=spark_conf)
self.start_date = start_date
# test_pandasudf.py
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd
import math
import os
class PandasUdfFunc:
@staticmethod
@pandas_udf("double", functionType=PandasUDFType.SCALAR)
def calc_polar_angle(lon1, lon2, lat1, lat2, pin_poi_dist):
r = []
for lo1, lo2, la1, la2, dist in zip(lon1, lon2, lat1, lat2, pin_poi_dist):
# return 0 if distance is less than 1m
if dist < 1:
r.append(0)
else:
# convert decimal degrees to radians
lo1=math.radians(lo1)
lo2=math.radians(lo2)
la1=math.radians(la1)
la2=math.radians(la2)
# polar angle formula
a = math.atan2((lo2 - lo1) * math.cos(0.5 * (la2 + la1)), la2 - la1)
r.append(a)
return pd.Series(r)
# main.py
from test_pandasudf import PandasUdfFunc
from test_base import FP
import pyspark
import os
if __name__ == "__main__":
os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'
FP = FP("2020-06-20")
sqlContext = pyspark.sql.SQLContext(FP.sc)
df = sqlContext.read.load("testing_file")
df_result = df = df.withColumn('polar_angle', PandasUdfFunc.calc_polar_angle( \
df["A_long"], df["B_long"], df["A_lat"], df["B_lat"], \
df["A_B_distance"]))
df_result.show(1)
我已经进行了一些谷歌搜索,似乎由于装饰器的原因,我必须在main.py中初始化SparkContext / Session,然后再导入PandasUDF类。
如果我错了,请纠正我,但是我相信我已经在这里启动了SparkContext:
FP = FP(“ 2020-06-20”)
请让我知道是否需要提供更多信息。 谢谢!