I get the following error when I try to run my code, and I don't know how to fix it (see the screenshot below). The error is raised when y is indexed with train_index, i.e. at y_train = y[train_index]. My train_index is of type int64 and my y variable is a list of integers; as you can see in the code, the y list is built with the map function.
(Screenshot: traceback showing the TypeError about the types of the variables.)
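For reference, the likely cause is that a plain Python list cannot be indexed with the NumPy index arrays that KFold.split yields. Here is a minimal sketch (with made-up values) that reproduces the TypeError and shows the usual fix of converting y to an ndarray first:

import numpy as np

y = [1, 0, 1, 1]                   # a plain Python list, as map() produces here
train_index = np.array([0, 2, 3])  # KFold.split yields integer index arrays

# y[train_index]   # TypeError: only integer scalar arrays can be converted
#                  # to a scalar index

y = np.asarray(y)      # convert once, right after building the list
print(y[train_index])  # -> [1 1 1]; NumPy fancy indexing now works

The full script for context: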
# Imports used below; feature_selector_lr and bmplot are assumed to come from
# the DTU 02450 course toolbox this script appears to be based on
import xlrd
import numpy as np
import sklearn.linear_model as lm
from sklearn import model_selection
from matplotlib.pyplot import (figure, subplot, plot, xlabel, ylabel,
                               title, clim, show)
from toolbox_02450 import feature_selector_lr, bmplot

# Load xls sheet with data
doc = xlrd.open_workbook('bank-one_duration.xlsx').sheet_by_index(0)
# Extract attribute names (first row, columns 0 through 45)
attributeNames = doc.row_values(0, 0, 46)
# Extract class labels to a python list,
# then encode them as integers
classLabels = doc.col_values(44, 1, 4521)
y = list(map(int, classLabels))  # note: this leaves y as a plain Python list
# Preallocate memory, then extract excel data to matrix X
X = np.empty((4520, 45))
for i, col_id in enumerate(range(0, 45)):
    X[:, i] = np.asarray(doc.col_values(col_id, 1, 4521))
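# Quick sanity check of what was just loaded (illustrative output):
#   print(type(y), len(y))   # <class 'list'> 4520 -> a list, not an ndarray
#   print(X.shape, X.dtype)  # (4520, 45) float64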
## Crossvalidation
# Create crossvalidation partition for evaluation
K = 5
CV = model_selection.KFold(n_splits=K,shuffle=True)
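# Note: CV.split(X) below yields NumPy integer index arrays (hence the
# int64 train_index mentioned above), e.g.:
#   train_idx, test_idx = next(CV.split(X))
#   print(train_idx.dtype)  # int64 on most platforms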
M = len(attributeNames)
# Initialize variables
Features = np.zeros((M,K))
Error_train = np.empty((K,1))
Error_test = np.empty((K,1))
Error_train_fs = np.empty((K,1))
Error_test_fs = np.empty((K,1))
Error_train_nofeatures = np.empty((K,1))
Error_test_nofeatures = np.empty((K,1))
k=0
for train_index, test_index in CV.split(X):
    # Extract training and test set for current CV fold
    X_train = X[train_index, :]
    y_train = y[train_index]
    X_test = X[test_index, :]
    y_test = y[test_index]
    internal_cross_validation = 10

    # Compute squared error without using the input data at all
    Error_train_nofeatures[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
    Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

    # Compute squared error with all features selected (no feature selection)
    m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
    Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
    Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]
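    # (Each of these errors is a mean squared error: the sum of squared
    # residuals divided by the number of samples in the split.)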
    # Compute squared error with feature subset selection
    # textout = 'verbose'
    textout = ''
    selected_features, features_record, loss_record = feature_selector_lr(
        X_train, y_train, internal_cross_validation, display=textout)
    Features[selected_features, k] = 1
    # .. alternatively you could use module sklearn.feature_selection
    # (see the commented sketch after this loop)
    if len(selected_features) == 0:
        print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
    else:
        m = lm.LinearRegression(fit_intercept=True).fit(X_train[:, selected_features], y_train)
        Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
        Error_test_fs[k] = np.square(y_test - m.predict(X_test[:, selected_features])).sum() / y_test.shape[0]
        figure(k)
        subplot(1, 2, 1)
        plot(range(1, len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')

        subplot(1, 3, 3)
        bmplot(attributeNames, range(1, features_record.shape[1]), -features_record[:, 1:])
        clim(-1.5, 0)
        xlabel('Iteration')

    print('Cross validation fold {0}/{1}'.format(k+1, K))
    print('Train indices: {0}'.format(train_index))
    print('Test indices: {0}'.format(test_index))
    print('Features no: {0}\n'.format(selected_features.size))
    k += 1
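# Aside: the sklearn.feature_selection alternative mentioned in the loop could
# look roughly like this (a sketch, not a drop-in replacement for
# feature_selector_lr):
#   from sklearn.feature_selection import RFECV
#   selector = RFECV(lm.LinearRegression(), step=1, cv=10).fit(X_train, y_train)
#   selected = np.where(selector.support_)[0]  # indices of the kept features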
# Inspect the selected feature coefficients' effect on the entire dataset and
# plot the fitted model's residual error as a function of each attribute to
# inspect for systematic structure in the residuals
f = 2  # cross-validation fold to inspect
ff = Features[:, f-1].nonzero()[0]
if len(ff) == 0:
    print('\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
else:
    m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y)
    y_est = m.predict(X[:, ff])
    residual = y - y_est

    figure(k+1, figsize=(12, 6))
    title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))
    for i in range(0, len(ff)):
        subplot(2, int(np.ceil(len(ff)/2.0)), i+1)  # cast to int: subplot needs integer grid sizes
        plot(X[:, ff[i]], residual, '.')
        xlabel(attributeNames[ff[i]])
        ylabel('residual error')

show()