我在下面的链接中关注示例。
https://www.analyticsvidhya.com/blog/2015/09/build-predictive-model-10-minutes-python/
这是我的实际代码。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from tabulate import tabulate
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
# Load data
train = pd.read_csv('C:\\path_here\\train.csv')
test = pd.read_csv('C:\\path_here\\test.csv')
pd.set_option('display.max_columns', None)
train.shape
list(train)
test.shape
list(test)
df = pd.concat([train,test],axis=0) #Combined both Train and Test Data set
df.shape
df = train.select_dtypes(include=[np.number])
df.shape
list(df)
df.dtypes
features = [ 'block', 'zip', 'lot', 'schooldist', 'xcoord', 'ycoord']
df['target'] = 0
df.head(10)
#create label encoders for categorical features
for var in df:
number = LabelEncoder()
df[var] = number.fit_transform(df[var].astype('str'))
df['target__office'] = number.fit_transform(df['target'].astype('str'))
training, testing = np.split(df.sample(frac=1), [int(.8*len(df))])
print('Training set shape:', training.shape)
print('Testing set shape:', testing.shape)
x_train = training[list(features)].values
y_train = training['target'].values
x_validate = testing[list(features)].values
y_validate = testing['target'].values
x_test = test[list(features)].values
random.seed(100)
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(x_train, y_train)
# Check performance and make predictions
status = rf.predict_proba(x_validate)
fpr, tpr, _ = roc_curve(y_validate, status[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
final_status = rf.predict_proba(x_test)
test['target__office']=final_status[:,1]
此行发生错误:
fpr, tpr, _ = roc_curve(y_validate, status[:,1])
这是我看到的错误消息。
IndexError: index 1 is out of bounds for axis 1 with size 1
我的x_validate看起来像这样:
x_validate
Out[812]:
array([[479, 156, 24, ..., 91, 667, 44],
[417, 0, 10, ..., 41, 177, 388],
[810, 123, 14, ..., 447, 12, 78],
...,
[201, 136, 7, ..., 266, 58, 448],
[189, 0, 32, ..., 0, 961, 962],
[ 63, 133, 19, ..., 548, 399, 400]])
我的y_validate看起来像这样:
y_validate
Out[813]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0])
有什么想法吗?我想我只是不太了解这些东西,无法自己解决错误。谢谢!