当我尝试在测试数据上使用pipeline.predict方法时,该方法将返回与训练数据长度相同的一维数组。
我不知道如何解决
这是我的变压器管道:
pipeline_feat_union = Pipeline([('preprocess', preprocess()),
('feat_union', feature_union()),
('classifier', GaussianNB())])
pipeline_feat_union.fit(X_train, y_train)
accuracy_score_feat_union.append(accuracy_score(y_test, pipeline_feat_union.predict(X_test)))
所以我的训练数据的尺寸为(33401,127),而我的测试数据为(11134,127)。但是当我打电话时:
pipeline_feat_union.predict(X_test)
我有点昏暗(33401)->显然有问题。
这是我的两个变形金刚(我怀疑这可能与feature_union()有关:
class preprocess(TransformerMixin, BaseEstimator):
def __init__(self):
self.X = X
self.PI2 = 'Product_Info_2'
def fit(self, X, y=None):
self.X = X
self.PI2_categories = list(training_data[self.PI2].unique())
return self
def transform(self, X, y=None):
Xt = X.copy()
Xt = pd.concat([Xt, pd.get_dummies(Xt[self.PI2])], axis=1).drop(self.PI2, axis=1)
Xt.drop('Id', axis=1, inplace=True)
Xt.fillna(value=0, inplace=True)
return Xt
class feature_union(TransformerMixin, BaseEstimator):
def __init__(self):
self.Xt = None
self.PI2_categories = ['D3', 'D4', 'A6', 'A5', 'D1', 'D2', 'A8', 'B2', 'E1',
'A1', 'A2', 'C1', 'C4', 'A7', 'C2', 'C3', 'A4', 'A3', 'B1']
def fit(self, X, y=None):
product_columns = ['Product_Info_1', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7'] + self.PI2_categories
product_idx = [col for col in range(X.shape[1]) if X.columns[col] in product_columns]
personal_columns = ['Ins_Age', 'Ht', 'Wt', 'BMI']
personal_idx = [col for col in range(X.shape[1]) if X.columns[col] in personal_columns]
medical_hist_columns = ["Medical_History_{}".format(x) for x in range(1, 42, 1)]
medical_hist_idx = [col for col in range(X.shape[1]) if X.columns[col] in medical_hist_columns]
family_hist_columns = ["Family_Hist_{}".format(x) for x in range(1, 6, 1)]
family_hist_idx = [col for col in range(X.shape[1]) if X.columns[col] in family_hist_columns]
insured_info_columns = ["InsuredInfo_{}".format(x) for x in range(1, 8, 1)]
insured_info_idx = [col for col in range(X.shape[1]) if X.columns[col] in insured_info_columns]
insurance_hist_columns = ["Insurance_History_{}".format(x) for x in range(1, 10, 1)]
insurance_hist_idx = [col for col in range(X.shape[1]) if X.columns[col] in insurance_hist_columns]
employment_info_columns = ["Employment_Info_{}".format(x) for x in range(1, 7, 1)]
employment_info_idx = [col for col in range(X.shape[1]) if X.columns[col] in employment_info_columns]
medical_keyword_columns = ["Medical_Keyword_{}".format(x) for x in range(1, 49, 1)]
medical_keyword_idx = [col for col in range(X.shape[1]) if X.columns[col] in medical_keyword_columns]
medical_keyword_columns = ["Medical_Keyword_{}".format(x) for x in range(1, 49, 1)]
medical_keyword_idx = [col for col in range(X.shape[1]) if X.columns[col] in medical_keyword_columns]
get_original_features = lambda X: X
get_product_columns = lambda X: X[:, product_idx]
get_personal_columns = lambda X: X[:, personal_idx]
get_medical_hist_columns = lambda X: X[:, medical_hist_idx]
get_family_hist_columns = lambda X: X[:, family_hist_idx]
get_insured_info_columns = lambda X: X[:, insured_info_idx]
get_insurance_hist_columns = lambda X: X[:, insurance_hist_idx]
get_employment_info_columns = lambda X: X[:, employment_info_idx]
get_medical_keyword_columns = lambda X: X[:, medical_keyword_idx]
get_medical_and_family = lambda X: X[:, medical_keyword_idx + medical_hist_idx + family_hist_idx]
union = FeatureUnion([
("original_features", FunctionTransformer(get_original_features)),
("product_interaction", Pipeline([('select_product', FunctionTransformer(get_product_columns)),
('product_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("personal_interaction", Pipeline([('select_personal', FunctionTransformer(get_personal_columns)),
('personal_interaction', PolynomialFeatures(4, include_bias=False, interaction_only=True))
])),
("medical_hist_interaction", Pipeline([('select_medical', FunctionTransformer(get_medical_hist_columns)),
('medical_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("family_hist_interaction", Pipeline([('select_family_hist', FunctionTransformer(get_family_hist_columns)),
('family_hist_interaction', PolynomialFeatures(5, include_bias=False, interaction_only=True))
])),
("insured_info_interaction", Pipeline([('select_insured_info', FunctionTransformer(get_insured_info_columns)),
('insured_info_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("insurance_hist_interaction", Pipeline([('select_insurance_hist', FunctionTransformer(get_insurance_hist_columns)),
('insurance_hist_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("employment_info_interaction", Pipeline([('select_employment_info', FunctionTransformer(get_employment_info_columns)),
('employment_info_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("medical_keyword_interaction", Pipeline([('select_medical_keyword', FunctionTransformer(get_medical_keyword_columns)),
('medical_keyword_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
])
self.Xt = union.fit_transform(X)
return self
def transform(self, X, y=None):
return self.Xt