我想将一次UDF与Pandas UDF的性能进行比较。这是我的代码:
from pyspark import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import SQLContext
# `sc` is the SparkContext global provided by the notebook environment
# (install_pypi_package is the EMR Notebooks runtime-install API).
sc.install_pypi_package("scipy")
sc.install_pypi_package("pandas")
# PyArrow version is pinned — presumably to match the Arrow API level the
# cluster's Spark release supports for pandas UDFs; TODO confirm.
sc.install_pypi_package("PyArrow==0.14.1")
import scipy as sp
import numpy as np
import pandas as pd
import itertools
import time
from scipy.stats import beta
def expand_grid(data_dict):
    """Build a DataFrame holding the Cartesian product of the dict's values.

    Each key becomes a column; every row is one combination drawn from the
    per-key value sequences.
    """
    combos = itertools.product(*data_dict.values())
    return pd.DataFrame.from_records(combos, columns=data_dict.keys())
# Build a one-million-row pandas frame of random (q, a, b) triples and ship
# it to Spark. Seed is fixed so the benchmark data is reproducible.
np.random.seed(123)
n_rows = int(1e6)
grid = pd.DataFrame({
    'q': np.random.random(n_rows),
    'a': np.random.random(n_rows),
    'b': np.random.random(n_rows),
})
#
# create spark data frame (`spark` is the notebook's SparkSession global)
#
grid_spark = spark.createDataFrame(grid)
#
# one-at-a-time
#
def qbeta(q, a, b):
    """Beta inverse CDF (quantile function) for a single (q, a, b) triple.

    Parameters are scalars; returns a plain Python float so Spark can map
    the result onto DoubleType.
    """
    # float() is the direct way to unbox the numpy scalar beta.ppf returns;
    # the original .tolist() only incidentally does the same for 0-d values.
    return float(beta.ppf(q, a, b))
qbeta_spark = F.udf(qbeta, DoubleType())
#
# TIME one-at-a-time
#
# NOTE: the first action also pays one-off costs (package import on the
# executors, JVM/JIT warm-up) — run the pipeline once untimed to warm up,
# then time it, otherwise the first measurement is inflated.
st = time.perf_counter()  # monotonic clock; preferred over time.time() for timing
# .rdd.count() materializes every row, which forces the UDF column to be computed
x = grid_spark.select(qbeta_spark("q", "a", "b")).rdd.count()
elapsed_row_udf = time.perf_counter() - st
# print explicitly: a bare expression only displays in a REPL/notebook,
# so the original `time.time() - st` was silently discarded as a script
print(f"row-at-a-time UDF: {elapsed_row_udf:.2f}s")
#
# Pandas (vectorized) UDF
#
# Arrow-backed scalar pandas UDF: invoked with whole pd.Series batches
# instead of one row at a time, so scipy's vectorized ppf does the work.
# PandasUDFType.SCALAR is the pre-Spark-3 API (matches the pinned PyArrow).
@F.pandas_udf("double", F.PandasUDFType.SCALAR)
def qbeta_scalar(q, a, b):
    # beta.ppf broadcasts elementwise over the three Series and returns an
    # ndarray; wrap it back into a Series as the scalar UDF contract requires
    return pd.Series(beta.ppf(q, a, b))
#
# TIME Pandas
#
st = time.perf_counter()
# BUG FIX: the DataFrame's columns are 'q', 'a', 'b' (see its construction
# above); the original referenced grid_spark.quantile / .mdr / .sdr, which
# raises AttributeError. Select the same columns as the row-UDF benchmark
# so the two timings are comparable.
x = grid_spark.select(qbeta_scalar(grid_spark.q, grid_spark.a, grid_spark.b)).rdd.count()
elapsed_pandas_udf = time.perf_counter() - st
print(f"pandas UDF: {elapsed_pandas_udf:.2f}s")
我知道Spark中的惰性评估,因此需要触发数据帧评估。我做错了什么吗?我注意到,如果我在第一次评估后第二次运行脚本,则时间要短得多。似乎在后台进行某种缓存。如何正确对PySpark UDF进行基准测试?