无法在RandomForestClassifier中训练数据

时间:2017-08-27 01:38:10

标签: python pandas

我无法用RandomForestClassifier训练数据。我正在尝试Kaggle的泰坦尼克号挑战(https://www.kaggle.com/c/titanic/data)。现在我想把数据放入RandomForest中进行训练,于是写了如下代码:

forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(xs,y)
test_df = pd.read_csv("Desktop/data/test.csv", delimiter=',')
test_df.replace("male",0).replace("female",1)
test_df["Age"].fillna(df.Age.median(), inplace=True)
test_df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
test_df2 = test_df.drop(["Name", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"], axis=1)

运行后发生如下错误:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-5cff55688b7b> in <module>()
      1 forest = RandomForestClassifier(n_estimators = 100)
----> 2 forest = forest.fit(xs,y)
      3 test_df = pd.read_csv("Desktop/data/test.csv", delimiter=',')
      4 # test_df.Sex = df.Sex.replace(mapping)
      5 test_df.replace("male",0).replace("female",1)

/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    245         """
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
    248         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    249         if issparse(X):

/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    380                                       force_all_finite)
    381     else:
--> 382         array = np.array(array, dtype=dtype, order=order, copy=copy)
    383 
    384         if ensure_2d:

ValueError: could not convert string to float: 'male'

于是我尝试了另一种写法:

forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(xs,y)
test_df = pd.read_csv("Desktop/data/test.csv", delimiter=',')
test_df.Sex = df.Sex.replace(mapping)
# test_df.replace("male",0).replace("female",1)
test_df["Age"].fillna(df.Age.median(), inplace=True)
test_df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
test_df2 = test_df.drop(["Name", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"], axis=1)

这段代码同样会报错:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-10-6963525c0470> in <module>()
      2 forest = forest.fit(xs,y)
      3 test_df = pd.read_csv("Desktop/data/test.csv", delimiter=',')
----> 4 test_df.Sex = df.Sex.replace(mapping)
      5 # test_df.replace("male",0).replace("female",1)
      6 test_df["Age"].fillna(df.Age.median(), inplace=True)

/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/generic.py in replace(self, to_replace, value, inplace, limit, regex, method, axis)
   3834 
   3835             return self.replace(to_replace, value, inplace=inplace,
-> 3836                                 limit=limit, regex=regex)
   3837         else:
   3838 

/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/generic.py in replace(self, to_replace, value, inplace, limit, regex, method, axis)
   3883                                                        dest_list=value,
   3884                                                        inplace=inplace,
-> 3885                                                        regex=regex)
   3886 
   3887                 else:  # [NA, ''] -> 0

/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/internals.py in replace_list(self, src_list, dest_list, inplace, regex, mgr)
   3257             return block, val
   3258 
-> 3259         masks = [comp(s) for i, s in enumerate(src_list)]
   3260 
   3261         result_blocks = []

/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/internals.py in <listcomp>(.0)
   3257             return block, val
   3258 
-> 3259         masks = [comp(s) for i, s in enumerate(src_list)]
   3260 
   3261         result_blocks = []

/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/internals.py in comp(s)
   3245             if isnull(s):
   3246                 return isnull(values)
-> 3247             return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq)
   3248 
   3249         def _cast_scalar(block, scalar):

/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/internals.py in _maybe_compare(a, b, op)
   4617             type_names[1] = 'ndarray(dtype=%s)' % b.dtype
   4618 
-> 4619         raise TypeError("Cannot compare types %r and %r" % tuple(type_names))
   4620     return result
   4621 

TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'str'

In [ ]:

我真的不明白:为什么在第一段代码中字符串无法转换为float,而在第二段代码中ndarray(dtype=int64)又无法与str进行比较。我该如何解决这个问题?完整代码如下:

# coding: utf-8

# In[1]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
get_ipython().magic('matplotlib inline')


# In[2]:

df = pd.read_csv("Desktop/data/train.csv", delimiter=',')
print(df.head())
print(df.columns)


# In[3]:

mapping = {'male' : 0, 'female' : 1}
df.Sex = df.Sex.replace(mapping)
print(df.Sex)
df.replace("male",0).replace("female",1)


# In[4]:

df["Age"].fillna(df.Age.median(),inplace=True)


# In[5]:

split_data = []
for survived in [0,1]:
    split_data.append(df[df.Survived==survived])
temp = [i["Pclass"].dropna() for i in split_data]
plt.hist(temp,histtype="barstacked",bins=3)


# In[6]:

temp = [i["Age"].dropna() for i in split_data]
plt.hist(temp, histtype="barstacked", bins=16)


# In[7]:

df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
df2 = df.drop(["Name", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"], axis=1)


# In[8]:

df2.head(10)


# In[9]:

train_data = df2.values
xs = train_data[:, 2:] # Pclass以降の変数
y  = train_data[:, 1]  # 正解データ

0 个答案:

没有答案