Pipeline Sklearn (missing basics) prediction error

时间:2017-09-22 04:16:38

标签: python scikit-learn pipeline

Need some help here — I'm a bit stuck on the concept of implementing pipelines with sklearn. The dataset is the KC Housing Dataset from Kaggle. I'm trying to build a simple linear regression using Pipelines, but I'm missing something very basic, because I can't get past the error pasted at the bottom of this post. Please advise, many thanks. The full code is below; modify it as necessary.

**ERROR:**
Traceback (most recent call last):
  File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 123, in <module>
    main()
  File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 118, in main
    predictions_some_data = lin_reg.predict(some_data_prepared)
  File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 256, in predict
    return self._decision_function(X)
  File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 241, in _decision_function
    dense_output=True) + self.intercept_
  File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/extmath.py", line 135, in safe_sparse_dot
    ret = a * b
  File "/usr/local/lib/python3.5/dist-packages/scipy/sparse/base.py", line 387, in __mul__
    raise ValueError('dimension mismatch')
ValueError: dimension mismatch

PS: The problem I'm facing is right at the end of this code: "predictions_some_data = lin_reg.predict(some_data_prepared)"
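For context on what this error usually means, here is a minimal standalone sketch (separate from the script below, with made-up labels rather than the KC Housing columns): an encoder that is re-fitted on a small slice of data can emit fewer one-hot columns than the encoder the model was trained against, and the narrower matrix then fails inside `predict` with exactly this kind of shape/dimension error. This is an illustration of the failure mode, not necessarily the exact cause in the script below.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer

# Train on one-hot columns produced from the full set of categories.
train_cats = np.array(['A', 'B', 'C', 'D'])
X_train = LabelBinarizer().fit_transform(train_cats)      # shape (4, 4)
model = LinearRegression().fit(X_train, [1.0, 2.0, 3.0, 4.0])

# Re-fitting a fresh binarizer on a 2-row slice yields fewer columns.
small_slice = np.array(['A', 'B'])
X_small = LabelBinarizer().fit_transform(small_slice)     # shape (2, 1)

try:
    model.predict(X_small)                                 # 1 column vs. 4 expected
except ValueError as err:
    print('predict failed:', err)                          # dimension/shape mismatch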

import pandas as pd
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import data_visualize
from sklearn.model_selection import StratifiedShuffleSplit
import dataPrep
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

## Loading the data
KC_housing_path = "/media/JBook/Software/PythonProjects/KCH/datasets"
def load_housing_data(housing_path=KC_housing_path):
    '''if not os.path.isfile("datasets/kc_house_data.csv"):
        print("Check file location, program exiting..")
    else:
        csv_path = os.path.join(housing_path, "kc_house_data.csv")
        print("reading csv file ...")
        return pd.read_csv(csv_path)'''
    try:
        csv_path = os.path.join(housing_path, "kc_house_data.csv")
        print("reading csv file -->")
        return pd.read_csv(csv_path)


    except FileNotFoundError:
        print("Check file location, program exiting ...")
        sys.exit()

### Defining 2 classes for custom transformers
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributeNames):
        self.attributes = attributeNames
        # print('\n In constructor', self.attributes)

    def fit(self, X, y=None):
        # print("__DF Fit Method:\n", (X[self.attributes].values).shape)
        return self

    def transform(self, X):
        # print("__Transform Method:\n", (X[self.attributes].values).shape)
        return X[self.attributes].values


class LabelBinarizerPipelineFriendly(LabelBinarizer):
    def fit(self, X, y=None):
        # print("LB-->X.shape", X.shape)
        """this would allow us to fit the model based on the X input."""
        super(LabelBinarizerPipelineFriendly, self).fit(X)

    def transform(self, X, y=None):
        # print("LB-Transform-X.shape", X.shape)
        return super(LabelBinarizerPipelineFriendly, self).transform(X)

    def fit_transform(self, X, y=None):
        # print("LB-FIT_TRANSFORM-X.Shape", X.shape)
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)


def main():
    # Loading House data
    housing = load_housing_data()
    housing_labels = housing['price']
    # Removing not needed features & label ( price)
    rem_attributes = ['id', 'date', 'price']
    housing_col_removed = housing.drop(rem_attributes, axis=1, inplace=False)

### Splitting the data
    train_set, test_set = train_test_split(housing_col_removed, test_size=0.3, random_state=42)

#### Pipeline for numeric & categorical attribute transformations
#### Adding median to missing values & making one hot vectors of categorical attributes

    data_numeric = housing_col_removed.drop('ocean_proximity', axis=1, inplace=False)
    numeric_attrib = list(data_numeric)
    cat_attrib = ['ocean_proximity']

    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(numeric_attrib)),
        ('imputing', Imputer(missing_values=0, strategy='median')),
    ])

    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attrib)),
        ('label_Bin', LabelBinarizerPipelineFriendly(sparse_output=True)),
    ])

    full_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline),
    ])

#### Fitting the linear regression model
    # print('This housing data is passed to prepare\n', housing_col_removed.head())
    housing_prepared = dataPrep.prepData(housing_col_removed)
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    print('Housing Prepared Shape: \n', housing_prepared.shape)
    print('\t\t\t\t************* Predictions from Linear Regression Are ***********\n', lin_reg.predict(housing_prepared))

### Below section tries to use some data (5 rows) from the whole data set to predict values (see the fit-once/transform-later sketch after the code)

    some_data = housing_col_removed[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.fit_transform(some_data)
    print('\t\t\tSome Data Prepared is\n', some_data_prepared)
    predictions_some_data = lin_reg.predict(some_data_prepared)
    print('\t\t\t\t************* Predictions from Linear Regression Are ***********\n', predictions_some_data)
    # print('\t\t\t\t************* Labels Are ***********\n', list(some_labels))


main()
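For reference, the usual pattern when reusing transformers on new rows is to fit the encoder (or full pipeline) once on the training data and then call only transform on the new rows, so they come out with the same number of columns the model was trained on. The sketch below uses the same hypothetical labels as the earlier snippet, so it illustrates the fit-once/transform-later idea rather than being a verified fix for the script above.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer

train_cats = np.array(['A', 'B', 'C', 'D'])
encoder = LabelBinarizer().fit(train_cats)          # fit once, on the training data
X_train = encoder.transform(train_cats)             # shape (4, 4)
model = LinearRegression().fit(X_train, [1.0, 2.0, 3.0, 4.0])

new_rows = np.array(['A', 'B'])                     # a small slice of "new" data
X_new = encoder.transform(new_rows)                 # still 4 columns - not re-fitted
print(model.predict(X_new))                         # shapes agree, predict works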

0 Answers:

No answers yet