我尝试从两个变量'date_birth'和'date_survey'创建一个新变量'age'
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import linear_model, pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
我的数据框
df = pd.DataFrame({'a':[1,2,3],
'date_survey': ['10.01.2013', '20.02.2014', '30.03.2015'],
'birth': ['1985', '1984', '1986'] })
管道代码
X = df[['date_survey', 'birth']]
y = df['a']
class MultiColumn:
def __init__(self,columns = None):
self.columns = columns # array of column names to encode
def fit(self,X,y=None):
return self
def transform(self, X):
return X[self.columns]
class Age(TransformerMixin):
def transform(self, X, y=None, **fit_params):
X['date_survey'] = pd.to_datetime(X['date_survey'])
year = pd.DataFrame(X['date_survey'].apply(lambda x: x.year))
age = X['birth'].convert_objects(convert_numeric=True) - year
return age
def fit(self, X, y=None, **fit_params):
return self
regressor = linear_model.SGDRegressor()
pipeline = Pipeline([
('union', FeatureUnion(
transformer_list=[
# age
('age', Pipeline([
('selector', MultiColumn(columns=['date_survey', 'birth'])),
('date', Age())
])),
])),
# Use a regression
('model_fitting', regressor),
])
pipeline.fit(X, y)
我收到错误
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
我猜是Age类中的错误,但我无法理解如何改进它
答案 0 :(得分:1)
date_survey birth date_survey_in_transform year
0 10.01.2013 1985 2013-10-01 2013
1 20.02.2014 1984 2014-02-20 2014
2 30.03.2015 1986 2015-03-30 2015
birth
- year
是否定的。
age = X['birth'].convert_objects(convert_numeric=True) - year
我修改了一些代码,让它运行没有错误。
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import linear_model, pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
df = pd.DataFrame({'a':[1,2,3],
'date_survey': ['10.01.2013', '20.02.2014', '30.03.2015'],
'birth': ['1985', '1984', '1986'] })
X = df[['date_survey', 'birth']]
y = df['a']
class MultiColumn:
def __init__(self,columns=None):
self.columns = columns # array of column names to encode
def fit(self,X,y=None):
return self
def transform(self, X):
return X[self.columns]
class Age(TransformerMixin):
def transform(self, X, y=None, **fit_params):
X['date'] = pd.to_datetime(X['date_survey'])
X['year'] = X['date'].dt.year
X['age'] = X['year'] - X['birth'].astype('int64')
return X['age'].reshape(-1, 1)
def fit(self, X, y=None, **fit_params):
return self
pipeline = Pipeline([
('union', FeatureUnion(
transformer_list=[
# age
('age', Pipeline([
('selector', MultiColumn(columns=['date_survey', 'birth'])),
('date', Age())
])
),
]
)
),
# Use a regression
('model_fitting', SGDRegressor())
])
pipeline.fit(X, y)