这是我的代码。
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# First, let’s load the data:
# read the data
train = pd.read_csv('C:\\Users\\Excel\\Desktop\\train.csv')
# checking the percentage of missing values in each variable
train.isnull().sum()/len(train)*100
n = len(train.columns)
print(n)
a = train.isnull().sum()/len(train)*100
variables = train.columns
variable = [ ]
for i in range(0,n):
if a[i]<=20: #setting the threshold as 20%
variable.append(variables[i])
# So the variables to be used are stored in “variable”, which contains only
# those features where the missing values are less than 20%.
train = train.apply(pd.to_numeric, errors='coerce')
train = train.fillna(0)
# Total missing:
allzeros = train.isnull().sum().sum()
print(allzeros)
# Check for any duplicates
train.drop_duplicates()
# display all columns in data frame
pd.set_option('display.max_columns', None)
print(train)
# get data types of each field in data frame
train.dtypes
# Count data types in pandas dataframe
train.dtypes.value_counts()
print(train.var())
print(np.var(train))
threshold = 0.2
train = train.drop(train.std()[train.std() < threshold].index.values, axis=1)
train.corr()
train.corr(method ='pearson')
from sklearn.ensemble import RandomForestRegressor
train = train.drop(['officearea'], axis=1)
model = RandomForestRegressor(random_state=1, max_depth=10)
train = pd.get_dummies(train)
model.fit(train,train.retailarea)
n = len(train.columns)
print(n)
# After fitting the model, plot the feature importance graph:
features = train.columns
importances = train.retailarea
indices = np.argsort(importances)[-20:] # top n features
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
将生成此图。
我正在尝试在y轴上列出所有功能,而不是数字。有人可以告诉我我在这里想念的吗?该代码来自下面的链接。
https://www.analyticsvidhya.com/blog/2018/08/dimensionality-reduction-techniques-python/