我正在尝试预测 "Full_Time_Home_Goals" 列(特征)。
我参照了 Kaggle 上的一个示例。该示例的代码在训练集和测试集行数不同的情况下也能正常运行,与我的情况类似(测试数据为 419 行,训练数据为 892 行)。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
# Widen pandas' console output so printed frames are shown in full. Raise the
# limits if the dataset has more rows or columns than this.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Columns kept from each file; the target column is selected from the
# training file only.
col = ['Id', 'HomeTeam', 'AwayTeam']
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']

# Files: load each CSV, keep the selected columns, drop rows with missing values.
data_train = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\train.csv")[columns].dropna()
data_test = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\test.csv")[col].dropna()

# Store the target as integers.
data_train = data_train.astype({'Full_Time_Home_Goals': int})
from sklearn import preprocessing
def encode_features(df_train, df_test):
    """Label-encode the team columns of the train and test frames.

    For each of ``HomeTeam`` and ``AwayTeam`` a separate ``LabelEncoder`` is
    fitted on that column's values from both frames together, so both frames
    share one mapping per column and ``transform`` never sees a value it was
    not fitted on. The two columns are encoded independently of each other.

    Args:
        df_train: Training frame containing ``HomeTeam`` and ``AwayTeam``.
        df_test: Test frame containing the same two columns.

    Returns:
        A ``(df_train, df_test)`` tuple of encoded copies. The frames passed
        in are left unmodified.
    """
    features = ['HomeTeam', 'AwayTeam']
    # Work on copies: writing into the caller's frames would modify them
    # behind the caller's back and can raise SettingWithCopyWarning when the
    # inputs are slices of another frame.
    df_train = df_train.copy()
    df_test = df_test.copy()
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder().fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
# Replace the team names in both frames with integer codes, then preview the result.
data_train, data_test = encode_features(data_train, data_test)
print(data_train.head())
print(data_test.head())
# X_all is meant to hold the columns used for prediction and y_all the single column we want to predict.
# NOTE(review): X_all is the whole training frame here, so it still contains
# 'Id' and the 'Full_Time_Home_Goals' target itself (4 columns in total). A
# model fitted on it therefore expects 4 features, while
# data_test.drop('Id', axis=1) supplies only 2.
X_all = data_train
y_all = data_train['Full_Time_Home_Goals']
from sklearn.model_selection import train_test_split
num_test = 0.20 # 80-20 split
# Hold out 20% of the training rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
# Tune a random forest by exhaustive grid search over the parameters below.
clf = RandomForestClassifier()
parameters = {
    'n_estimators': [4, 6, 9],
    # 'auto' is omitted: for classifiers it meant the same as 'sqrt', and
    # scikit-learn >= 1.3 rejects it as an invalid value.
    'max_features': ['log2', 'sqrt'],
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

# Score each parameter combination by plain classification accuracy.
acc_scorer = make_scorer(accuracy_score)
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already refitted the best
# parameter combination on all of X_train, so no further fit call is needed.
clf = grid_obj.best_estimator_
predictions = clf.predict(X_test)
运行上述代码时,我得到的错误如下:
Traceback (most recent call last):
  File "C:/Users/harsh/PycharmProjects/Kaggle-Machine Learning from Start to Finish with Scikit-Learn/EPL Predicting.py", line 98, in <module>
    predictions = clf.predict(data_test.drop('Id', axis=1))
  File "C:\Users\harsh\PycharmProjects\GitHub\venv\lib\site-packages\sklearn\ensemble\_forest.py", line 629, in predict
ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 2
将代码从
predictions = clf.predict(data_test.drop('Id', axis=1))
改为
predictions = clf.predict(X_test)
之后,错误变为:
raise ValueError(msg) ValueError: array length 37921 does not match index length 380
如何解决此问题?
答案 0 :(得分:0)
以下经过测试且可以正常使用的代码:
# Columns kept from each file; the target column is selected from the
# training file only.
col = ['Id', 'HomeTeam', 'AwayTeam']
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']

# Load each CSV, keep the selected columns, drop rows with missing values.
data_train = pd.read_csv(r"train.csv")[columns].dropna()
data_test = pd.read_csv(r"test.csv")[col].dropna()

# Store the target as integers.
data_train = data_train.astype({'Full_Time_Home_Goals': int})
from sklearn import preprocessing
def encode_features(df_train, df_test):
    """Label-encode the team columns of the train and test frames.

    For each of ``HomeTeam`` and ``AwayTeam`` a separate ``LabelEncoder`` is
    fitted on that column's values from both frames together, so both frames
    share one mapping per column and ``transform`` never sees a value it was
    not fitted on. The two columns are encoded independently of each other.

    Args:
        df_train: Training frame containing ``HomeTeam`` and ``AwayTeam``.
        df_test: Test frame containing the same two columns.

    Returns:
        A ``(df_train, df_test)`` tuple of encoded copies. The frames passed
        in are left unmodified.
    """
    features = ['HomeTeam', 'AwayTeam']
    # Work on copies: writing into the caller's frames would modify them
    # behind the caller's back and can raise SettingWithCopyWarning when the
    # inputs are slices of another frame.
    df_train = df_train.copy()
    df_test = df_test.copy()
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder().fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Replace the team names in both frames with integer codes, then preview the result.
data_train, data_test = encode_features(data_train, data_test)
print(data_train.head())
print(data_test.head())

# y_all is the column we want to predict; X_all holds only the predictive
# columns. The target is removed, and so is 'Id': it is a row identifier, and
# the test ids lie outside the range seen in training, so splitting on it
# cannot generalize.
y_all = data_train['Full_Time_Home_Goals']
X_all = data_train.drop(['Id', 'Full_Time_Home_Goals'], axis=1)

num_test = 0.20  # 80-20 split
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, random_state=23)

# Tune a random forest by exhaustive grid search over the parameters below.
clf = RandomForestClassifier()
parameters = {
    'n_estimators': [4, 6, 9],
    # 'auto' is omitted: for classifiers it meant the same as 'sqrt', and
    # scikit-learn >= 1.3 rejects it as an invalid value.
    'max_features': ['log2', 'sqrt'],
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

# Score each parameter combination by plain classification accuracy.
acc_scorer = make_scorer(accuracy_score)
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already refitted the best
# parameter combination on all of X_train, so no further fit call is needed.
clf = grid_obj.best_estimator_

# Accuracy on the held-out 20% of the training data.
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))

# Predict the real test set using the same feature columns as in training,
# keeping 'Id' only to label the output rows.
ids = data_test['Id']
predictions = clf.predict(data_test.drop('Id', axis=1))
df_preds = pd.DataFrame({"id": ids, "predictions": predictions})
print(df_preds)
Id HomeTeam AwayTeam Full_Time_Home_Goals
0 1 55 440 3
1 2 158 493 2
2 3 178 745 1
3 4 185 410 1
4 5 249 57 2
Id HomeTeam AwayTeam
0 190748 284 54
1 190749 124 441
2 190750 446 57
3 190751 185 637
4 190752 749 482
0.33213786556261704
id predictions
0 190748 1
1 190749 1
2 190750 1
3 190751 1
4 190752 1
... ... ...
375 191123 1
376 191124 1
377 191125 1
378 191126 1
379 191127 1
380 rows × 2 columns