需要一些帮助,我有点坚持使用sklearn 实施管道的概念。该数据集来自Kaggle的KC Housing Dataset。我正在尝试使用Pipelines构建一个简单的线性回归。但是,我遗漏了一些非常基本的概念,因为我无法通过粘贴在这篇文章底部的错误。请指教,非常感谢。这是完整的代码,可以在必要时修改代码。
**ERROR:**
Traceback (most recent call last):
File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 123, in <module>
main()
File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 118, in main
predictions_some_data = lin_reg.predict(some_data_prepared)
File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 256, in predict
return self._decision_function(X)
File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 241, in _decision_function
dense_output=True) + self.intercept_
File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/extmath.py", line 135, in safe_sparse_dot
ret = a * b
File "/usr/local/lib/python3.5/dist-packages/scipy/sparse/base.py", line 387, in __mul__
raise ValueError('dimension mismatch')
ValueError: dimension mismatch
PS:我面临的问题就是这段代码的结尾 “predictions_some_data = lin_reg.predict(some_data_prepared)”
import pandas as pd
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import data_visualize
from sklearn.model_selection import StratifiedShuffleSplit
import dataPrep
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
**## Loading the data**
KC_housing_path = "/media/JBook/Software/PythonProjects/KCH/datasets"
def load_housing_data(housing_path=KC_housing_path):
'''if not os.path.isfile("datasets/kc_house_data.csv"):
print("Check file location, program exiting..")
else:
csv_path = os.path.join(housing_path, "kc_house_data.csv")
print("reading csv file ...")
return pd.read_csv(csv_path)'''
try:
csv_path = os.path.join(housing_path, "kc_house_data.csv")
print("reading csv file -->")
return pd.read_csv(csv_path)
except FileNotFoundError:
print("Check file location, program exiting ...")
sys.exit()
**### Defining 2 classes for custom transformers**
class DataFrameSelector(BaseEstimator, TransformerMixin):
def __init__(self, attributeNames):
self.attributes = attributeNames
# print('\n In constructor', self.attributes)
def fit(self, X, y=None):
# print("__DF Fit Method:\n", (X[self.attributes].values).shape)
return self
def transform(self, X):
# print("__Transform Method:\n", (X[self.attributes].values).shape)
return X[self.attributes].values
class LabelBinarizerPipelineFriendly(LabelBinarizer):
def fit(self, X, y=None):
# print("LB-->X.shape", X.shape)
"""this would allow us to fit the model based on the X input."""
super(LabelBinarizerPipelineFriendly, self).fit(X)
def transform(self, X, y=None):
# print("LB-Transform-X.shape", X.shape)
return super(LabelBinarizerPipelineFriendly, self).transform(X)
def fit_transform(self, X, y=None):
# print("LB-FIT_TRANSFORM-X.Shape", X.shape)
return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)
def main():
# Loading House data
housing = load_housing_data()
housing_labels = housing['price']
# Removing not needed features & label ( price)
rem_attributes = ['id', 'date', 'price']
housing_col_removed = housing.drop(rem_attributes, axis=1, inplace=False)
**### Splitting the data**
train_set, test_set = train_test_split(housing_col_removed, test_size=0.3, random_state=42)
**#### Pipleline for numeric & categorical attribute transformations
#### Adding median to missing values & making one hot vectors of categorical attributes**
data_numeric = housing_col_removed.drop('ocean_proximity', axis=1, inplace=False)
numeric_attrib = list(data_numeric)
cat_attrib = ['ocean_proximity']
num_pipeline = Pipeline([
('selector', DataFrameSelector(numeric_attrib)),
('imputing', Imputer(missing_values=0, strategy='median')),
])
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attrib)),
('label_Bin', LabelBinarizerPipelineFriendly(sparse_output=True)),
])
full_pipeline = FeatureUnion(transformer_list=[
('num_pipeline', num_pipeline),
('cat_pipeline', cat_pipeline),
])
**#### Fitting the linear regression model**
# print('This housing data is passed to prepare\n', housing_col_removed.head())
housing_prepared = dataPrep.prepData(housing_col_removed)
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
print('Housing Prepared Shape: \n', housing_prepared.shape)
print('\t\t\t\t\************* Predictions from Linear Regression Are ***********\n',lin_reg.predict(housing_prepared))
**### Below section is trying to use some data (5 rows ) from whole data set to predict values**
some_data = housing_col_removed[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.fit_transform(some_data)
print('\t\t\tSome Data Prepared is\n', some_data_prepared)
predictions_some_data = lin_reg.predict(some_data_prepared)
print('\t\t\t\t\************* Predictions from Linear Regression Are ***********\n', predictions_some_data)
# print('\t\t\t\t\************* Labels Are ***********\n', list(some_labels))
main()