I am trying to compute the bias and variance of a PySpark linear regression model. I start from a third-degree polynomial, add some noise, and fit linear regression models with polynomial expansions of increasing degree. The goal is to show that as the degree of the polynomial expansion grows, the bias decreases and the variance increases. In the code below, the model bias stays constant because the mean of the predictions is identical for the degree-1, degree-2 and degree-3 models. I must be computing the bias incorrectly, and I would also like to know whether I am computing the variance correctly. Can someone confirm whether my bias calculation is right (or wrong), and help me figure out why the bias stays the same regardless of the degree of the polynomial expansion? Comments on anything else that is wrong in the code are welcome.
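For reference, this is the decomposition I am trying to reproduce, as I understand it, written as a plain NumPy sketch (the function and its predictions/y_true arguments are my own names, not part of the code below; predictions is meant to hold fitted values from models re-fit on independent noise draws):

import numpy as np

def bias_squared_and_variance(predictions, y_true):
    # predictions: array of shape (n_refits, n_points), one row per model
    # re-fit on an independent noise realisation; y_true: shape (n_points,)
    f_hat_mean = predictions.mean(axis=0)                # E[f_hat(x)] at each x
    bias_squared = np.mean((f_hat_mean - y_true) ** 2)   # (E[f_hat(x)] - f(x))^2, averaged over x
    variance = np.mean((predictions - f_hat_mean) ** 2)  # E[(f_hat(x) - E[f_hat(x)])^2], averaged over x
    return bias_squared, variance

My actual PySpark code follows.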
from pyspark.sql import SparkSession
from pyspark.ml import feature, regression, Pipeline
from pyspark.sql import functions as fn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# create numpy arrays for x and y data
x = np.linspace(-15, 15, 250)
y = 10 + 5*x + 0.5*np.square(x) - 0.1*np.power(x,3)
reducible_error = np.random.uniform(-50, 50, len(x))
irreducible_error = np.random.normal(0, 8, len(x))
y_noise = y + reducible_error + irreducible_error
# plot x and y data
%matplotlib inline
plt.figure()
plt.plot(x,y, c='r', label="y")
plt.scatter(x, y_noise, label="y_noise")
plt.legend()
plt.title("10 + 5x + 0.5x^2 - 0.1x^3")
plt.xlabel("x")
plt.ylabel("y, y_noise")
# create a pandas dataframe from the x, y, y_hat data arrays
pd_df = pd.DataFrame({'x': x, 'y_noise': y_noise, 'y': y}, columns=['x', 'y_noise', 'y'])
# create a spark dataframe from the pandas dataframe
df = spark.createDataFrame(pd_df)
df.show()
def get_bias_squared(df):
    # mean squared distance between the noisy labels and the overall mean prediction
    f_hat_mean = np.mean(df['prediction'])
    return np.mean(np.square(df['y_noise'] - f_hat_mean))

def get_variance(df):
    # mean squared deviation of the predictions from their own mean
    f_hat_mean = np.mean(df['prediction'])
    diff = df['prediction'] - f_hat_mean
    return np.mean(np.square(diff))
def plot_poly_expansion(n, df, lambda_reg=0., alpha_reg=0.):
    # build the pipeline: assemble x into a vector, expand to degree-n polynomial features, fit linear regression
    va = feature.VectorAssembler(inputCols=['x'], outputCol='features')
    pe = feature.PolynomialExpansion(degree=n, inputCol='features', outputCol='poly_features')
    lr = regression.LinearRegression(featuresCol='poly_features', labelCol='y_noise', regParam=lambda_reg,
                                     elasticNetParam=alpha_reg)
    # fit the pipeline and add the predictions to the dataframe
    pipe = Pipeline(stages=[va, pe, lr]).fit(df)
    fit_df = pipe.transform(df)
    # convert the fitted spark dataframe to pandas and plot predicted vs. actual
    fit_pd_df = fit_df.toPandas()
    # display(fit_pd_df.head())
    fit_pd_df.plot(x='x', y=['y', 'y_noise', 'prediction'])
    plt.title("Polynomial degree = %s\nBias = %s, Variance = %s" % (n, get_bias_squared(fit_pd_df),
                                                                    get_variance(fit_pd_df)))
    plt.xlabel("x")
    plt.ylabel("y")
    return fit_pd_df
for i in np.arange(1, 4):
    plot_poly_expansion(int(i), df)
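In case it clarifies what I am after, here is roughly how I picture collecting such a predictions matrix by re-fitting on fresh noise each time. fit_and_predict is a hypothetical helper of my own that mirrors the stages inside plot_poly_expansion without the plotting, and it reuses spark, pd, np, feature, regression, Pipeline, x and y from the code above:

def fit_and_predict(degree, x, y_noisy):
    # hypothetical helper: same stages as plot_poly_expansion, minus the plotting
    sdf = spark.createDataFrame(pd.DataFrame({'x': x, 'y_noise': y_noisy}))
    va = feature.VectorAssembler(inputCols=['x'], outputCol='features')
    pe = feature.PolynomialExpansion(degree=degree, inputCol='features', outputCol='poly_features')
    lr = regression.LinearRegression(featuresCol='poly_features', labelCol='y_noise')
    model = Pipeline(stages=[va, pe, lr]).fit(sdf)
    # sort by x so the prediction rows line up with the original grid
    return model.transform(sdf).toPandas().sort_values('x')['prediction'].values

n_refits = 20
for degree in (1, 2, 3):
    preds = np.vstack([
        fit_and_predict(degree, x,
                        y + np.random.uniform(-50, 50, len(x)) + np.random.normal(0, 8, len(x)))
        for _ in range(n_refits)
    ])
    # bias_squared_and_variance is the sketch from the top of the question
    print(degree, bias_squared_and_variance(preds, y))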