与“Matplotlib 在生成图时出现问题”是同一个问题。我正在尝试构建一套自己能够理解的代码组件流程,每个组件都让我更接近预测列以及接近 1 的相关性,也就是训练良好的数据和模型拟合。但是,matplotlib 不允许我生成相关矩阵(corrmat),这让我非常沮丧:它无法在数据中找到某一列,而我知道该列确实存在。我已经检查过表头和数据。
请看下面的代码。(我认为)所有相关模块都已导入:
import pandas as pd
import numpy as np
# Load the training and test sets. Both are decoded as cp1252 and read with
# error_bad_lines=False so that malformed rows do not abort the load.
train = pd.read_csv(
    "..._new.csv",
    encoding='cp1252',
    error_bad_lines=False,
)
test = pd.read_csv(
    "..._test.csv",
    encoding='cp1252',
    error_bad_lines=False,
)

# A previously written CSV, printed only for a quick visual check.
submission = pd.read_csv("C:\\Users\\jcst\\Desktop\\Private\\Python data\\train16.csv")
print(submission.head(20))

# Target column, and the predictors (everything except the id and the target).
y = train["DTUDur"]
X = train.drop(columns=["person_id", "DTUDur"])

print("\nSIZE OF DATATABLE")
print(X.head(10))
print("\nCOLUMNS IN DATA")
print(X.columns, X.shape)
# -------------------------------------------
# Actual data processing
print("\nCHECK MISSING DATA")
def check_missing_data(X):
    """Summarise the missing values of each column of a DataFrame.

    Args:
        X: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with two columns: ``Total``
        (number of null cells) and ``Percent`` (share of null cells, 0-100),
        ordered from the most to the least incomplete column.
    """
    missing = X.isnull()
    total = missing.sum().sort_values(ascending=False)
    # Reuse the ordering of `total` so both columns line up row by row even
    # when several columns share the same number of missing values.
    percent = (missing.mean() * 100).reindex(total.index)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# Missing-value summary of the training predictors. The result has to be
# printed explicitly: in a script a bare expression is evaluated and dropped.
print(check_missing_data(X).head())
# Same summary for the test set.
print(check_missing_data(test).head(25))
# -------------------------------------------
# Unique values in the dataset
print("\nUNIQUE VALUES")
# One row per column of X with its number of distinct values, ascending.
df_tmp = pd.DataFrame(
    X.nunique().sort_values(),
    columns=['num_unique_values'],
).reset_index().rename(columns={'index': 'Column_name'})
print(df_tmp.head(10))
print("\nCOLUMNS WITH UNIQUE VALUES")
# Columns with unique values
def col_name_with_n_unique_value(X, n):
    """Return the names of the columns of ``X`` holding exactly ``n`` distinct values.

    Args:
        X: The pandas DataFrame to inspect.
        n: The number of distinct (non-null) values a column must have to be
            reported.

    Returns:
        A list of column names, ordered by ascending number of distinct values.
    """
    unique_counts = X.nunique().sort_values()
    # Compare against n; the count was previously hard-coded to 1, which made
    # the parameter meaningless.
    col_name = list(unique_counts.index[unique_counts == n])
    print('仅列数', n, '唯一值是:', len(col_name))
    return col_name
# Columns that hold a single distinct value.
col_to_drop = col_name_with_n_unique_value(X, 1)
# -------------------------------------------
# correlation of the matrix
import matplotlib.pyplot as plt
import seaborn as sns

corrmat = X.corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=.8, annot=True, ax=ax)
# -------------------------------------------
# most correlated features
# The correlation matrix of `train` is computed once and reused below.
corrmat = train.corr()
top_corr_features = corrmat.index[abs(corrmat["DTUDur"]) > 0.1]
plt.rcParams.update({'font.size': 5})
plt.figure(figsize=(6, 6))
g = sns.heatmap(train[top_corr_features].corr(), annot=True, cmap="RdYlGn")

# NOTE(review): corrmat only has entries for the columns corr() treated as
# numeric. If DTUDurYr was parsed as text this lookup raises KeyError even
# though the column is present in the CSV - confirm with train.dtypes.
top_corr_features = corrmat.index[abs(corrmat["DTUDurYr"]) > 0.7]
# TODO: an earlier bar plot here read `sns.barplot(train., train.DTUDurYr)`.
# The column name after `train.` was missing, which is a syntax error, so the
# statement was removed; re-add it once the intended column is known.
plt.figure(figsize=(10, 10))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
g = sns.barplot(x=train.DTUDurYr, y=train.GennemsnitBeståetAndetSemesterECTS)
plt.show()
# -------------------------------------------
# divide the data set into categorial and non categorial features and apply models to get the insight of the data
print("\n定义化学和数值特征")
# Columns stored with dtype `object` are treated as categorical, every other
# column as numerical.
categorical_features = X.select_dtypes(include=['object']).columns
print(categorical_features)
numerical_features = X.select_dtypes(exclude=["object"]).columns
print(numerical_features)
print("\n将数据集划分为分类和非分类特征,并应用模型来获取数据的洞察力")
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
print("\nFILLING THE MISSING VALUE OF TEST WITH THEIR MEAN VALUE, FOR BETTER ACCURACY")
# 'object' replaces np.object, an alias that newer NumPy releases removed.
test = test.select_dtypes(exclude=['object'])
test.info()
# fillna(..., inplace=True) returns None, so assigning its result wiped out
# `test`; use the returned copy instead.
test = test.fillna(test.mean())
print("\nAPPLYING MODEL RANDOM FOREST REGRESSOR")
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
# pull data into target (y) and predictors (X)
predictor_cols = ['F18 ECTS på kurser med beståede talkarakter']
# -------------------------------------------
# Create training predictors data
# fit() raises "ValueError: Input contains NaN, infinity or a value too large"
# when the predictors hold NaN or +/-inf, so infinities are mapped to NaN and
# every gap is filled with the mean of its training column.
train_X = X[predictor_cols].replace([np.inf, -np.inf], np.nan)
train_means = train_X.mean()
train_X = train_X.fillna(train_means)
# NOTE(review): y is passed through unchanged; if DTUDur itself has missing
# values those rows must be dropped from both train_X and y before fitting.
my_model = RandomForestRegressor()
my_model.fit(train_X, y)
print(predictor_cols)
print(my_model.score(train_X, y))
test = pd.read_csv("M:\\20190214_Datasæt_new_test.csv")
# -------------------------------------------
print("\nPRINT PREDICTED FACTORS")
# `test` was just re-read from disk, so any earlier imputation is gone.
# Fill its gaps with the training means so predict() sees the same kind of
# input the model was fitted on.
test_X = test[predictor_cols].replace([np.inf, -np.inf], np.nan).fillna(train_means)
# model to make predictions
predicted_factor = my_model.predict(test_X)
# look at the predicted values to ensure something sensible.
print(predicted_factor)
np.savetxt("M:\\train22.csv", predicted_factor, delimiter=';')
Traceback (most recent call last):
  File "C:/Users/jcst/PycharmProjects/Frafaldsanalyse/DefiningCatAndNumFeatures_4_new.py", line 142, in <module>
    my_model.fit(train_X, y)
  File "C:\Users\jcst\PycharmProjects\Frafaldsanalyse\venv\lib\site-packages\sklearn\ensemble\forest.py", line 250, in fit
    X = check_array(X, accept_sparse="csc", dtype=DTYPE)
  File "C:\Users\jcst\PycharmProjects\Frafaldsanalyse\venv\lib\site-packages\sklearn\utils\validation.py", line 573, in check_array
    allow_nan=force_all_finite == 'allow-nan')
  File "C:\Users\jcst\PycharmProjects\Frafaldsanalyse\venv\lib\site-packages\sklearn\utils\validation.py", line 56, in _assert_all_finite
    raise ValueError(msg_err.format(type_err, X.dtype))
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').