Question

我的代码如下所示。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

from subprocess import check_output
import os
import time

### Load the training and test CSVs, reporting how long each read takes
start_time = time.time()  # wall-clock reference for the train read
train = pd.read_csv('C:\\Users\\my_path_here\\train.csv')
train_read_seconds = time.time() - start_time
print("{} seconds".format(train_read_seconds))

start_time = time.time()  # reset the reference for the test read
test = pd.read_csv('C:\\Users\\my_path_here\\test.csv')
test_read_seconds = time.time() - start_time
print("{} seconds".format(test_read_seconds))


# First look at the data: (rows, columns) of each frame, train first
for frame in (train, test):
    print(frame.shape)


# Remove the column display limit so wide frames are shown in full
pd.set_option('display.max_columns', None)
# Bare expressions: these previews only render in an interactive session
train.head()
test.head()


# Data-quality report, run identically on train and then on test.
for frame in (train, test):
    # Number of missing values in each column
    print(frame.isnull().sum())
    # Number of cells equal to zero in each column
    print((frame == 0).astype(int).sum(axis=0))
    # Number of fully duplicated rows. The previous drop_duplicates() call
    # returned a new frame that was thrown away, so it neither reported nor
    # removed anything; counting is what a "check" actually needs.
    print(frame.duplicated().sum())



# Print the dtype of every column so we know what we are working with
for frame in (train, test):
    print(frame.dtypes)


# Convert the timestamp columns from object to datetime.
# Only train has a drop-off timestamp; test has the pickup time alone.
for column in ('pickup_datetime', 'dropoff_datetime'):
    train[column] = pd.to_datetime(train[column])
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])


# Derive calendar features from the pickup timestamp on both frames.
for frame in (train, test):
    pickup = frame['pickup_datetime'].dt
    # Series.dt.week was deprecated and then removed from pandas;
    # isocalendar().week is its replacement. It yields a nullable UInt32
    # column, so cast to a plain int for plotting and modelling.
    # NOTE(review): the cast assumes no pickup timestamp is missing (NaT).
    frame['week'] = pickup.isocalendar().week.astype(int)
    frame['weekday'] = pickup.weekday  # Monday=0 ... Sunday=6
    frame['hour'] = pickup.hour


# Summary statistics of the per-trip passenger count in the training set
passenger_values = train['passenger_count'].values
most_passengers = passenger_values.max()
fewest_passengers = passenger_values.min()
mean_passengers = passenger_values.mean()
print("Maximum number of passengers on a trip : ", most_passengers)
print("Minimum number of passengers on a trip : ", fewest_passengers)
print("Average number of passengers on a trip : ", mean_passengers)


# Bar chart: number of trips for each passenger count
plt.figure(figsize=(10, 5))
pass_count = train['passenger_count'].value_counts()
# x and y must be passed by keyword: current seaborn releases no longer
# accept them positionally.
sns.barplot(x=pass_count.index, y=pass_count.values, alpha=0.7)
plt.xlabel('Number of passengers on a trip', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.show()


# Count plot: number of pickups in each week of the year
plt.figure(figsize=(15, 5))
sns.countplot(x='week', data=train)
# The x axis is the 'week' feature, so label it as such (it previously
# read 'Day of month', which does not match the plotted column).
plt.xlabel('Week of year', fontsize=14)
plt.ylabel('Pickup count', fontsize=14)
plt.show()


# Count plot: number of pickups on each day of the week
plt.figure(figsize=(15, 5))
days = list(range(7))  # pandas weekday codes: Monday=0 ... Sunday=6
day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
# Fix the bar order to all seven weekday codes so that tick position i is
# always weekday i, even if some weekday never occurs in the data.
sns.countplot(x='weekday', data=train, order=days)
plt.xlabel('Day of the week', fontsize=14)
plt.ylabel('Pickup count', fontsize=14)
plt.xticks(days, day_names)
plt.show()


# Count plot: number of pickups in each hour of the day
f = plt.figure(figsize=(15, 5))
hour_axes = sns.countplot(data=train, x='hour')
# Label through the Axes that countplot drew on (the current axes)
hour_axes.set_xlabel('Hour', fontsize=14)
hour_axes.set_ylabel('Pickup count', fontsize=14)
plt.show()


# Remove identifier, flag and raw timestamp columns so that only the
# features used in the feature-engineering steps remain.
for column in ('id', 'store_and_fwd_flag', 'pickup_datetime', 'dropoff_datetime'):
    train.drop([column], axis=1, inplace=True)

# NOTE(review): test drops 'key' while train drops 'id', and the submission
# step later reads test.id -- the two CSVs may not share a schema; confirm
# that test.csv belongs to the same dataset as train.csv.
# 'store_and_fwd_flag' is intentionally left untouched on test here.
for column in ('key', 'pickup_datetime'):
    test.drop([column], axis=1, inplace=True)


# Confirm what is left in each frame
for frame in (train, test):
    print(frame.dtypes)



# Feature engineering starts here.
# y is the dependent variable (trip_duration); X is everything else.
# pop() returns the target column and removes it from train in one step,
# so the target is not left among the predictors.
y = train.pop('trip_duration')
X = train
X.shape, y.shape


from sklearn.model_selection import train_test_split

# Stage 1: set aside 20% of the labelled rows as a hold-out test split
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split
X_train.shape, y_train.shape, X_test.shape, y_test.shape


# Stage 2: split the remaining rows again (default test size) to obtain a
# validation split for scoring the model
validation_split = train_test_split(X_train, y_train, random_state=42)
X_train, X_valid, y_train, y_valid = validation_split
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape


# Fit a random forest on the training split, score it on the validation
# split, predict on the test frame and write the submission file.
from sklearn.ensemble import RandomForestRegressor

# max_features=1.0 considers every feature at each split, which is what the
# former 'auto' setting meant for regressors; newer scikit-learn releases
# no longer accept 'auto'.
m1 = RandomForestRegressor(
    n_estimators=19,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features=1.0,
    max_depth=80,
    bootstrap=True,
)
m1.fit(X_train, y_train)
m1.score(X_valid, y_valid)  # R^2 on the validation split

# The model can only predict on a frame that has every training feature.
# Check this up front so a schema mismatch (e.g. a test.csv taken from a
# different dataset) is reported clearly instead of as an opaque
# "not in index" error raised from inside the column selection.
test_columns = X_train.columns
missing_columns = [col for col in test_columns if col not in test.columns]
if missing_columns:
    raise KeyError(
        "test is missing feature columns the model was trained on: {}; "
        "columns present in test: {}".format(missing_columns, list(test.columns))
    )
predictions = m1.predict(test[test_columns])

my_submission = pd.DataFrame({'id': test.id, 'trip_duration': predictions})
my_submission.head()


my_submission.to_csv('C:\\Users\\my_path_here\\submit_file.csv', index=False)

一切正常，直到我到达这一行：

predictions = m1.predict(test[test_columns])

到达该行后，我会看到此错误。

KeyError: "['vendor_id'] not in index"

这很奇怪，因为当我检查'test_columns'时，我看到以下内容：

Index(['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'week', 'weekday', 'hour'],
      dtype='object')

您知道我的设置有什么问题吗？我该如何解决？

示例代码来自此处。

Answer 1

test 数据帧本身中似乎没有 vendor_id 这一列。

尝试使用以下代码查看 test 中存在的列列表：

test.columns

使用 RandomForestRegressor 时出错

1 个答案: