我正在PySpark上移植用Python开发的预测算法。在数据清理的各个步骤中,我需要在名为“label”的数据框列上检测异常值。
由于LinearRegression方法不提供皮尔逊残差,而GeneralizedLinearRegression提供,因此我选择使用后者,并设置family='gaussian'和link='identity'。
我的预期结果是复制statsmodels.regression.linear_model.OLS的resid_pearson方法(见下面的示例)。
我的结果确实远非如此:我做错了吗?
from pyspark import Row
from pyspark.ml.linalg import DenseVector
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.sql import functions as F  # FIX: F was used below but never imported

# Toy dataset: single numeric feature per row, target column 'label'.
d = [Row(label=972.0, features=DenseVector([3554660.0])), Row(label=878.0, features=DenseVector([3142096.0])), Row(label=842.0, features=DenseVector([2923543.0])), Row(label=846.0, features=DenseVector([2825708.0])), Row(label=892.0, features=DenseVector([2813121.0])), Row(label=1056.0, features=DenseVector([2944565.0])), Row(label=1325.0, features=DenseVector([3310550.0])), Row(label=1682.0, features=DenseVector([4019892.0])), Row(label=1721.0, features=DenseVector([4630756.0])), Row(label=1680.0, features=DenseVector([4863541.0])), Row(label=1612.0, features=DenseVector([4670499.0])), Row(label=1727.0, features=DenseVector([4500632.0])), Row(label=1347.0, features=DenseVector([4295096.0])), Row(label=1373.0, features=DenseVector([4039019.0])), Row(label=1053.0, features=DenseVector([4110518.0])), Row(label=1375.0, features=DenseVector([4167948.0])), Row(label=1362.0, features=DenseVector([4370901.0])), Row(label=1827.0, features=DenseVector([4752816.0])), Row(label=1898.0, features=DenseVector([5406963.0])), Row(label=1933.0, features=DenseVector([6270136.0])), Row(label=1747.0, features=DenseVector([6312119.0])), Row(label=1500.0, features=DenseVector([5659896.0])), Row(label=1230.0, features=DenseVector([5025736.0])), Row(label=1093.0, features=DenseVector([4307496.0])), Row(label=938.0, features=DenseVector([3692721.0])), Row(label=866.0, features=DenseVector([3210317.0])), Row(label=831.0, features=DenseVector([2939295.0])), Row(label=833.0, features=DenseVector([2817028.0])), Row(label=874.0, features=DenseVector([2781300.0])), Row(label=1024.0, features=DenseVector([2854775.0]))]
df = spark.createDataFrame(d).select(['label', 'features'])

# gaussian family + identity link == ordinary least squares.
# NOTE: fitIntercept defaults to True, so an intercept IS fitted here.
glr = GeneralizedLinearRegression(family="gaussian", link='identity', maxIter=1000)
# FIX: the original called lr.fit(df), but the estimator is named glr.
model_detection = glr.fit(df)

# For the gaussian family Spark's 'pearson' residuals are the RAW residuals
# (y - y_hat): the pearson weighting is 1 for gaussian, and Spark does NOT
# divide by sqrt(dispersion). statsmodels' resid_pearson additionally divides
# by sqrt(scale), which is why the magnitudes below differ from OLS by a
# factor of roughly the residual standard deviation (~400 here).
model_detection.summary.residuals(residualsType='pearson').show(30)
#+-------------------+
#| pearsonResiduals|
#+-------------------+
#| -273.0018702844045|
#|-241.68175634480713|
#| -211.2942701321449|
#|-177.57598800554024|
#|-127.75257085291003|
#|-3.6798962052830575|
#| 154.14879001022882|
#| 295.67962580263475|
#| 149.12406089246724|
#| 37.413473629814234|
#| 28.051758684079232|
#| 194.6504240901195|
#|-122.91612330934049|
#| -19.13037657111272|
#| -360.8488561926192|
#|-56.293747515059295|
#|-130.94259008102017|
#| 218.04721194911008|
#| 90.34403723380365|
#|-136.85270971838827|
#|-335.60543255296943|
#|-384.48669055408936|
#| -461.8547511315146|
#| -380.6827374900522|
#| -348.9391703169199|
#| -274.4045132820902|
#| -227.0790851784809|
#|-187.93935807904222|
#|-136.08664909126355|
#| -8.40535690801903|
#+-------------------+

# Dividing the pearsonResiduals column by (roughly) this stddev would put them
# on the same standardized scale as statsmodels' resid_pearson.
model_detection.summary.residuals(residualsType='pearson').agg(F.stddev('pearsonResiduals')).show()
#+-----------------------------+
#|stddev_samp(pearsonResiduals)|
#+-----------------------------+
#| 400.990196594329|
#+-----------------------------+
### OLS resid_pearson
# Reference computation with statsmodels, used for comparison against the
# Spark GLM residuals above.
from statsmodels.regression.linear_model import OLS
pdf = df.toPandas()
# Unpack the single-element DenseVector into a plain scalar column so
# statsmodels can consume it as a regressor.
pdf['features'] = pdf.features.apply(lambda x: x[0])
# NOTE(review): no constant column is added, so this OLS is fit WITHOUT an
# intercept, whereas Spark's GeneralizedLinearRegression fits one by default
# (fitIntercept=True) -- the two models are not the same regression.
m = OLS(pdf.label,pdf.features).fit()
# resid_pearson = resid / sqrt(scale): residuals standardized by the estimated
# error std dev. Spark's 'pearson' residuals for gaussian/identity are the raw
# residuals (y - y_hat), NOT divided by sqrt(dispersion), which explains the
# ~400x magnitude difference between the two outputs.
m.resid_pearson
#array([-0.74457447, -0.56452167, -0.400476  , -0.22692427,  0.02179103,
#        0.63161756,  1.39571276,  2.05857723,  1.29336531,  0.72372976,
#        0.68844378,  1.52757899, -0.04095255,  0.49060445, -1.21436348,
#        0.29808452, -0.08534209,  1.62925486,  0.95533831, -0.22601622,
#       -1.21769501, -1.42277597, -1.77069777, -1.32461854, -1.13061248,
#       -0.73138373, -0.47996194, -0.27799569, -0.01782471,  0.61335826])