import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn import metrics, model_selection
from xgboost.sklearn import XGBClassifier
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import ensemble
warnings.filterwarnings('ignore')
train = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/train.csv (6)/train.csv')
test = pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test.csv (2)/test.csv')
test_labels=pd.read_csv('E:/vishnu vardhan/EM19424AVN1007WRPYTH/data/data/test_labels.csv/test_labels.csv')
print("\nTrain Data")
print("==========\n",train)
print("\nTest Data")
print("==========\n",test)
print("\nTest_labels Data")
print("================\n",test_labels)
sns.barplot(x='toxic', y='identity_hate', data=train);
plt.show()
print("\n\nTrain data shape:",train.shape)
print("\nTest data shape:",test.shape)
print("\nTestLabels data shape:",test_labels.shape)
print("\nCorrelation matrix")
print("==================")
plt.title('Correlation Matrix')
sns.heatmap(train.corr())
plt.show()
print("\n Data Descriptive")
print("================\n",train.describe())
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
max_depth = 5, alpha = 10, n_estimators = 10)
print("\nRegressor")
print("===========\n",xg_reg)
X = test_labels.iloc[:,1:6].values
Y = test_labels.iloc[:,6].values
#print("X value\n",X,"\n\nY value \n",Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier()
model.fit(X_train, y_train)
print("\n Classifier")
print("============\n",model)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
params = {
'n_estimators': 1,
'max_depth': 1,
'learning_rate': 1,
'criterion': 'mse'
}
gradient_boosting_regressor = ensemble.GradientBoostingRegressor(**params)
gradient_boosting_regressor.fit(X, Y)
plt.figure(figsize=(10, 5))
plt.title('Gradient Boosting model (1 estimators, Single tree split)')
plt.scatter(X, Y)
plt.plot(X, gradient_boosting_regressor.predict(X), color='r')
plt.show()
在执行上述代码时,会发生此错误。
“ raise ValueError(” x和y的大小必须相同“)”
我有.csv
文件,具有1398行和2列。我已将40%作为y_test设置,因为在上面的代码中可以看到它。