我尝试使用xgboost来建模我的数据。一小部分数据附加为excel文件(text.xlsx)。该数据包含10个样本(每行代表一个样本),其具有17个特征(列A至列Q)。这些特征是离散的分类变量,由字母表示。列R表示样品的标签(分类)。
我使用以下python 2代码来使用xgboost:
# !/usr/bin/python
# -*- encoding:utf-8 -*-
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split # cross_validation
from sklearn.metrics import accuracy_score
def translate(x):
if x == 'A':
x = 1
elif x == 'C':
x = 2
elif x == 'D':
x = 3
elif x == 'E':
x = 4
elif x == 'F':
x = 5
elif x == 'G':
x = 6
elif x == 'H':
x = 7
elif x == 'I':
x = 8
elif x == 'K':
x = 9
elif x == 'L':
x = 10
elif x == 'M':
x = 11
elif x == 'N':
x = 12
elif x == 'P':
x = 13
elif x == 'Q':
x = 14
elif x == 'R':
x = 15
elif x == 'S':
x = 16
elif x == "T":
x = 17
elif x == 'V':
x = 18
elif x == 'W':
x = 19
elif x == 'Y':
x = 20
return x
def g_h(y_hat, y):
p = 1.0 / (1.0 + np.exp(-y_hat))
g = p - y.get_label()
h = p * (1.0-p)
return g, h
def error_rate(y_hat, y):
return 'error', float(sum(y.get_label() != (y_hat > 0.5))) / len(y_hat)
def plot_variable_importance(x, y):
tree = DecisionTreeClassifier( random_state = 99 )
tree.fit(x, y)
plot_model_var_imp(tree, x, y)
def plot_model_var_imp(model, x, y):
imp = pd.DataFrame(
model.feature_importances_ ,
columns = ['Importance'] ,
index = x.columns
)
if __name__ == "__main__":
data = pd.read_excel('test.xlsx',header = None)
x, y = data[range(17)], data[17]
x = pd.get_dummies(x)
x = x.applymap(translate)
for i in range(0,10):
for j in range(0,17):
x.iloc[i][j] = translate(x.iloc[i][j])
print "x = \n", x
print "y = \n", y
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.7)
# print "x_test = \n", x_test
# print "x_train = \n", x_train
# plot_variable_importance(x_train, y_train)
data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
print data_train
watch_list = [(x_test, 'eval'), (x_train, 'train')]
param = {'max_depth': 4, 'eta': 0.3, 'silent': 1, 'objective': 'binary:logistic'}
bst = xgb.train(param, x_train, num_boost_round=6, evals=watch_list)
y_hat = bst.predict(x_test)
print "y_hat = \n", y_hat
运行此代码会显示以下错误消息:
***** TypeError:无效的缓存项:DataFrame异常AttributeError: "'助力器'对象没有属性'句柄'"在> 忽略*****
有人可以告诉我应该如何修改代码才能使其正常工作?