我正在PySpark上移植用Python开发的预测算法。在数据清理的各个步骤中,我需要在名为“label”的数据框列上检测异常值。
由于LinearRegression方法不提供皮尔逊残差,而GeneralizedLinearRegression提供,因此我选择使用后者,并设置family='gaussian'和link='identity'。
我的预期结果是复制statsmodels.regression.linear_model.OLS的resid_pearson方法(见下面的示例)。
我的结果确实远非如此:我做错了吗?
from pyspark import Row
from pyspark.ml.linalg import DenseVector
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.sql import functions as F  # FIX: F was used below but never imported

# Toy dataset: single numeric feature per row, target column 'label'.
d = [Row(label=972.0, features=DenseVector([3554660.0])), Row(label=878.0, features=DenseVector([3142096.0])), Row(label=842.0, features=DenseVector([2923543.0])), Row(label=846.0, features=DenseVector([2825708.0])), Row(label=892.0, features=DenseVector([2813121.0])), Row(label=1056.0, features=DenseVector([2944565.0])), Row(label=1325.0, features=DenseVector([3310550.0])), Row(label=1682.0, features=DenseVector([4019892.0])), Row(label=1721.0, features=DenseVector([4630756.0])), Row(label=1680.0, features=DenseVector([4863541.0])), Row(label=1612.0, features=DenseVector([4670499.0])), Row(label=1727.0, features=DenseVector([4500632.0])), Row(label=1347.0, features=DenseVector([4295096.0])), Row(label=1373.0, features=DenseVector([4039019.0])), Row(label=1053.0, features=DenseVector([4110518.0])), Row(label=1375.0, features=DenseVector([4167948.0])), Row(label=1362.0, features=DenseVector([4370901.0])), Row(label=1827.0, features=DenseVector([4752816.0])), Row(label=1898.0, features=DenseVector([5406963.0])), Row(label=1933.0, features=DenseVector([6270136.0])), Row(label=1747.0, features=DenseVector([6312119.0])), Row(label=1500.0, features=DenseVector([5659896.0])), Row(label=1230.0, features=DenseVector([5025736.0])), Row(label=1093.0, features=DenseVector([4307496.0])), Row(label=938.0, features=DenseVector([3692721.0])), Row(label=866.0, features=DenseVector([3210317.0])), Row(label=831.0, features=DenseVector([2939295.0])), Row(label=833.0, features=DenseVector([2817028.0])), Row(label=874.0, features=DenseVector([2781300.0])), Row(label=1024.0, features=DenseVector([2854775.0]))]
df = spark.createDataFrame(d).select(['label', 'features'])

# gaussian family + identity link == ordinary least squares.
# NOTE: fitIntercept defaults to True, so an intercept IS fitted here.
glr = GeneralizedLinearRegression(family="gaussian", link='identity', maxIter=1000)
# FIX: the original called lr.fit(df), but the estimator is named glr.
model_detection = glr.fit(df)

# For the gaussian family Spark's 'pearson' residuals are the RAW residuals
# (y - y_hat): the pearson weighting is 1 for gaussian, and Spark does NOT
# divide by sqrt(dispersion). statsmodels' resid_pearson additionally divides
# by sqrt(scale), which is why the magnitudes below differ from OLS by a
# factor of roughly the residual standard deviation (~400 here).
model_detection.summary.residuals(residualsType='pearson').show(30)
#+-------------------+
#| pearsonResiduals|
#+-------------------+
#| -273.0018702844045|
#|-241.68175634480713|
#| -211.2942701321449|
#|-177.57598800554024|
#|-127.75257085291003|
#|-3.6798962052830575|
#| 154.14879001022882|
#| 295.67962580263475|
#| 149.12406089246724|
#| 37.413473629814234|
#| 28.051758684079232|
#| 194.6504240901195|
#|-122.91612330934049|
#| -19.13037657111272|
#| -360.8488561926192|
#|-56.293747515059295|
#|-130.94259008102017|
#| 218.04721194911008|
#| 90.34403723380365|
#|-136.85270971838827|
#|-335.60543255296943|
#|-384.48669055408936|
#| -461.8547511315146|
#| -380.6827374900522|
#| -348.9391703169199|
#| -274.4045132820902|
#| -227.0790851784809|
#|-187.93935807904222|
#|-136.08664909126355|
#| -8.40535690801903|
#+-------------------+

# Dividing the pearsonResiduals column by (roughly) this stddev would put them
# on the same standardized scale as statsmodels' resid_pearson.
model_detection.summary.residuals(residualsType='pearson').agg(F.stddev('pearsonResiduals')).show()
#+-----------------------------+
#|stddev_samp(pearsonResiduals)|
#+-----------------------------+
#| 400.990196594329|
#+-----------------------------+
### OLS resid_pearson
# Reference computation with statsmodels, used for comparison against the
# Spark GLM residuals above.
from statsmodels.regression.linear_model import OLS
pdf = df.toPandas()
# Unpack the single-element DenseVector into a plain scalar column so
# statsmodels can consume it as a regressor.
pdf['features'] = pdf.features.apply(lambda x: x[0])
# NOTE(review): no constant column is added, so this OLS is fit WITHOUT an
# intercept, whereas Spark's GeneralizedLinearRegression fits one by default
# (fitIntercept=True) -- the two models are not the same regression.
m = OLS(pdf.label,pdf.features).fit()
# resid_pearson = resid / sqrt(scale): residuals standardized by the estimated
# error std dev. Spark's 'pearson' residuals for gaussian/identity are the raw
# residuals (y - y_hat), NOT divided by sqrt(dispersion), which explains the
# ~400x magnitude difference between the two outputs.
m.resid_pearson
#array([-0.74457447, -0.56452167, -0.400476  , -0.22692427,  0.02179103,
#        0.63161756,  1.39571276,  2.05857723,  1.29336531,  0.72372976,
#        0.68844378,  1.52757899, -0.04095255,  0.49060445, -1.21436348,
#        0.29808452, -0.08534209,  1.62925486,  0.95533831, -0.22601622,
#       -1.21769501, -1.42277597, -1.77069777, -1.32461854, -1.13061248,
#       -0.73138373, -0.47996194, -0.27799569, -0.01782471,  0.61335826])