我编写了以下脚本,以找出要在我的sklearn算法中使用的最佳功能。
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plots
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
data = pd.read_csv("movies.csv")
X = data.iloc[6:] #columns with words
y = genre_to_binary(data.iloc[1]) #target column i.e genre
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Genre'] #naming the dataframe columns
print(featureScores.nlargest(10,'Genre')) #print 10 best features
data = pd.read_csv("movies.csv")
X = data.iloc[:,0:20] #independent columns
y = data.iloc[:,-1] #target column i.e price range
model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()
def genre_to_binary(series):
genres_list = series.values.tolist()
converted_genre=[]
for i in np.arange(len(genres_list)):
if genres_list[i]=="action":
converted_genre=np.append(converted_genre,0)
else:
converted_genre=np.append(converted_genre,1)
return converted_genre
但是,这会引发以下错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-78-b772929399c8> in <module>
1 #apply SelectKBest class to extract top 10 best features
2 bestfeatures = SelectKBest(score_func=chi2, k=10)
----> 3 fit = bestfeatures.fit(X,y)
4 dfscores = pd.DataFrame(fit.scores_)
5 dfcolumns = pd.DataFrame(X.columns)
/srv/app/venv/lib/python3.6/site-packages/sklearn/feature_selection/univariate_selection.py in fit(self, X, y)
339 self : object
340 """
--> 341 X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True)
342
343 if not callable(self.score_func):
/srv/app/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
717 ensure_min_features=ensure_min_features,
718 warn_on_dtype=warn_on_dtype,
--> 719 estimator=estimator)
720 if multi_output:
721 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
/srv/app/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
534 # make sure we actually converted to numeric:
535 if dtype_numeric and array.dtype.kind == "O":
--> 536 array = array.astype(np.float64)
537 if not allow_nd and array.ndim >= 3:
538 raise ValueError("Found array with dim %d. %s expected <= 2."
ValueError: could not convert string to float: 'natural born killers'
我不明白为什么天生的杀手甚至包含在x或y中,因为它只出现在我无法访问的索引0的列中。那么问题出在哪里呢?
您可以找到文件here。