我的代码如下所示。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
from subprocess import check_output
import os
import time
### Loading the Data
start_time = time.time() # track time
train = pd.read_csv('C:\\Users\\my_path_here\\train.csv')
print("%s seconds" % (time.time() - start_time))
start_time = time.time() # track time
test = pd.read_csv('C:\\Users\\my_path_here\\test.csv')
print("%s seconds" % (time.time() - start_time))
# Start to explore the data sets
print(train.shape)
print(test.shape)
# show all column names in a data frame
pd.set_option('display.max_columns', None)
train.head()
test.head()
# Check null value
print(train.isnull().sum())
# Check zero value
print((train == 0).astype(int).sum(axis=0))
# Check for any duplicates
train.drop_duplicates()
# Check null value
print(test.isnull().sum())
# Check zero value
print((test == 0).astype(int).sum(axis=0))
# Check for any duplicates
test.drop_duplicates()
# get list of data types; understand what we're looking at
print(train.dtypes)
print(test.dtypes)
# change data types from object to datetime
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])
# Start to visualize the data sets
train['week'] = train.pickup_datetime.dt.week
train['weekday'] = train.pickup_datetime.dt.weekday
train['hour'] = train.pickup_datetime.dt.hour
test['week'] = test.pickup_datetime.dt.week
test['weekday'] = test.pickup_datetime.dt.weekday
test['hour'] = test.pickup_datetime.dt.hour
# find and plot taxi passenger counts
pass_count = train['passenger_count']
print("Maximum number of passengers on a trip : ", np.max(pass_count.values))
print("Minimum number of passengers on a trip : ", np.min(pass_count.values))
print("Average number of passengers on a trip : ", np.mean(pass_count.values))
f = plt.figure(figsize=(10,5))
pass_count = train['passenger_count'].value_counts()
sns.barplot(pass_count.index, pass_count.values, alpha=0.7)
plt.xlabel('Number of passengers on a trip', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.show()
f = plt.figure(figsize=(15,5))
sns.countplot(x='week', data=train)
plt.xlabel('Day of month', fontsize=14)
plt.ylabel('Pickup count', fontsize=14)
plt.show()
f = plt.figure(figsize=(15,5))
days = [i for i in range(7)]
sns.countplot(x='weekday', data=train)
plt.xlabel('Day of the week', fontsize=14)
plt.ylabel('Pickup count', fontsize=14)
plt.xticks(days, ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'))
plt.show()
f = plt.figure(figsize=(15,5))
sns.countplot(x='hour', data=train)
plt.xlabel('Hour', fontsize=14)
plt.ylabel('Pickup count', fontsize=14)
plt.show()
# drop a few columns/features...we need all strings and numerics for our feature engineering steps
train.drop(['id'], axis=1, inplace=True)
train.drop(['store_and_fwd_flag'], axis=1, inplace=True)
train.drop(['pickup_datetime'], axis=1, inplace=True)
train.drop(['dropoff_datetime'], axis=1, inplace=True)
test.drop(['key'], axis=1, inplace=True)
#test.drop(['store_and_fwd_flag'], axis=1, inplace=True)
test.drop(['pickup_datetime'], axis=1, inplace=True)
print(train.dtypes)
print(test.dtypes)
# Start features engineering process
# set X as the independent variable and y as the dependent variable
# we will drop 'trip_duration' from the training dataset;
# other factors can influence our ability to make predictions
y = train['trip_duration']
train.drop(['trip_duration'], axis=1, inplace=True)
X = train
X.shape, y.shape
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, random_state=42)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape
# not working...
from sklearn.ensemble import RandomForestRegressor
m1 = RandomForestRegressor(n_estimators=19, min_samples_split=2, min_samples_leaf=4, max_features='auto', max_depth=80, bootstrap=True)
m1.fit(X_train, y_train)
m1.score(X_valid, y_valid)
test_columns = X_train.columns
predictions = m1.predict(test[test_columns])
my_submission = pd.DataFrame({'id': test.id, 'trip_duration': predictions})
my_submission.head()
my_submission.to_csv('C:\\Users\\my_path_here\\submit_file.csv', index=False)
一切正常,直到我到达这一行:
predictions = m1.predict(test[test_columns])
到达该行后,我会看到此错误。
KeyError: "['vendor_id'] not in index"
这很奇怪,因为当我检查'test_columns'时,我看到以下内容:
Index(['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
'dropoff_longitude', 'dropoff_latitude', 'week', 'weekday', 'hour'],
dtype='object')
您知道我的设置有什么问题吗?我该如何解决?
示例代码来自here。
答案 0 :(得分:1)
vendor_id
数据帧本身中似乎没有test
。
尝试使用
查看test
中存在的列列表
test.columns