Question

作为一个新手，我试图建立自己可理解的代码组件流，每个组件都将我带到预测的cols和接近1的相关性，即经过训练的数据和模型拟合。但是，我对matplotlib不允许我生成corr mats的事实感到非常沮丧，因为我无法跟踪数据中的列，而且我知道它在那里。我检查了。标头和数据。

查看代码。导入了所有相关模块（我认为）

import pandas as pd
import numpy as np

train = pd.read_csv("..._new.csv", encoding='cp1252', error_bad_lines=False) # , encoding='cp1252', error_bad_lines=False
test = pd.read_csv("..._test.csv", encoding='cp1252', error_bad_lines=False) # , encoding='cp1252', error_bad_lines=False
submission = pd.read_csv("C:\\Users\\jcst\\Desktop\\Private\\Python data\\train16.csv")

print(submission.head(20))

X = train.drop(["person_id", "DTUDur"],axis = 1)
y = train["DTUDur"]

print("\nSIZE OF DATATABLE")

print(X.head(10))

print("\nCOLUMNS IN DATA")

print(X.columns, X.shape)

# -------------------------------------------

# Actual data processing

print("\nCHECK MISSING DATA")

def check_missing_data(X):
    total = X.isnull().sum().sort_values(ascending = False)
    percent = ((X.isnull().sum()/X.isnull().count())*100).sort_values(ascending = False)
return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

check_missing_data(X).head()

print(check_missing_data(test).head(25))

# -------------------------------------------

# Unique values in the dataset

print("\nUNIQUE VALUES")

df_tmp=pd.DataFrame(X.nunique().sort_values(),columns=['num_unique_values']).reset_index().rename(columns={'index':'Column_name'})

print(df_tmp.head(10))

print("\nCOLUMNS WITH UNIQUE VALUES")

# Columns with unique values

def col_name_with_n_unique_value(X, n):
    df1=pd.DataFrame(X.nunique().sort_values(),columns=['num_unique_values']).reset_index()
col_name=list(df1[df1.num_unique_values==1]['index'])
print('number of columns with only', n, 'unique values are: ', len(col_name))
return col_name

col_to_drop=col_name_with_n_unique_value(X,1)

# -------------------------------------------

# correlation of the matrix

import matplotlib.pyplot as plt
import seaborn as sns

corrmat = X.corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=.8, annot=True);

corrmat = train.corr()
top_corr_features = corrmat.index[abs(corrmat["DTUDur"])>0.1]
plt.rcParams.update({'font.size': 5})
plt.figure(figsize=(6,6))
g = sns.heatmap(train[top_corr_features].corr(), annot=True, cmap="RdYlGn")

这是我的回溯

Traceback (most recent call last):
  File "C:\Users\jcst\PycharmProjects\Frafaldsanalyse\venv\lib\site-packages\pandas\core\indexes\base.py", line 2657, in get_loc
    return self._engine.get_loc(key)
  File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'DTUDur'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:/Users/jcst/PycharmProjects/Frafaldsanalyse/DefiningCatAndNumFeatures_4_new.py", line 76, in <module>
    top_corr_features = corrmat.index[abs(corrmat["DTUDur"])>0.1]
  File "C:\Users\jcst\PycharmProjects\Frafaldsanalyse\venv\lib\site-packages\pandas\core\frame.py", line 2927, in __getitem__
    indexer = self.columns.get_loc(key)
  File "C:\Users\jcst\PycharmProjects\Frafaldsanalyse\venv\lib\site-packages\pandas\core\indexes\base.py", line 2659, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'DTUDur'

Matplotlib导致生成地图的问题

0 个答案: