我无法用 RandomForestClassifier 训练数据。我正在做 Kaggle 的泰坦尼克号挑战(https://www.kaggle.com/c/titanic/data)。现在我想用随机森林对数据进行训练,于是写了如下代码:
# Fit a 100-tree random forest on xs / y, then load the test CSV and
# preprocess it (sex encoding, age imputation, family-size feature).
forest = RandomForestClassifier(n_estimators = 100)
# NOTE(review): the traceback that follows is raised here because xs still
# contains the string 'male' -- presumably xs was built from a df whose Sex
# column had not been encoded yet (e.g. cells run out of order); confirm.
forest = forest.fit(xs,y)
test_df = pd.read_csv("Desktop/data/test.csv", delimiter=',')
# NOTE(review): replace() is called without inplace=True and its return value
# is not assigned, so this line leaves test_df unchanged.
test_df.replace("male",0).replace("female",1)
# Missing ages in the test set are filled with the median Age of df
# (the frame loaded from train.csv), not of test_df.
test_df["Age"].fillna(df.Age.median(), inplace=True)
# NOTE(review): SibSp / Parch are read from df, not test_df -- presumably
# test_df was intended here; confirm.
test_df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
# Drop the columns that are not used as model features.
test_df2 = test_df.drop(["Name", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"], axis=1)
发生错误
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-10-5cff55688b7b> in <module>()
1 forest = RandomForestClassifier(n_estimators = 100)
----> 2 forest = forest.fit(xs,y)
3 test_df = pd.read_csv("Desktop/data/test.csv", delimiter=',')
4 # test_df.Sex = df.Sex.replace(mapping)
5 test_df.replace("male",0).replace("female",1)
/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
245 """
246 # Validate or convert input data
--> 247 X = check_array(X, accept_sparse="csc", dtype=DTYPE)
248 y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
249 if issparse(X):
/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
380 force_all_finite)
381 else:
--> 382 array = np.array(array, dtype=dtype, order=order, copy=copy)
383
384 if ensure_2d:
ValueError: could not convert string to float: 'male'
我尝试了另一种方式
# Second attempt: same pipeline, but the Sex column of test_df is assigned
# from a mapping-based replace instead of the chained replace() calls.
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(xs,y)
test_df = pd.read_csv("Desktop/data/test.csv", delimiter=',')
# NOTE(review): the right-hand side reads df.Sex (the training frame), not
# test_df.Sex -- presumably test_df.Sex was intended; confirm. The traceback
# that follows ("Cannot compare types 'ndarray(dtype=int64)' and 'str'")
# suggests df.Sex already held integers when the string-keyed mapping was
# applied to it again.
test_df.Sex = df.Sex.replace(mapping)
# test_df.replace("male",0).replace("female",1)
# Missing ages in the test set are filled with the median Age of df
# (the frame loaded from train.csv), not of test_df.
test_df["Age"].fillna(df.Age.median(), inplace=True)
# NOTE(review): SibSp / Parch are read from df, not test_df -- presumably
# test_df was intended here; confirm.
test_df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
# Drop the columns that are not used as model features.
test_df2 = test_df.drop(["Name", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"], axis=1)
此代码也会发生错误
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-10-6963525c0470> in <module>()
2 forest = forest.fit(xs,y)
3 test_df = pd.read_csv("Desktop/data/test.csv", delimiter=',')
----> 4 test_df.Sex = df.Sex.replace(mapping)
5 # test_df.replace("male",0).replace("female",1)
6 test_df["Age"].fillna(df.Age.median(), inplace=True)
/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/generic.py in replace(self, to_replace, value, inplace, limit, regex, method, axis)
3834
3835 return self.replace(to_replace, value, inplace=inplace,
-> 3836 limit=limit, regex=regex)
3837 else:
3838
/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/generic.py in replace(self, to_replace, value, inplace, limit, regex, method, axis)
3883 dest_list=value,
3884 inplace=inplace,
-> 3885 regex=regex)
3886
3887 else: # [NA, ''] -> 0
/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/internals.py in replace_list(self, src_list, dest_list, inplace, regex, mgr)
3257 return block, val
3258
-> 3259 masks = [comp(s) for i, s in enumerate(src_list)]
3260
3261 result_blocks = []
/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/internals.py in <listcomp>(.0)
3257 return block, val
3258
-> 3259 masks = [comp(s) for i, s in enumerate(src_list)]
3260
3261 result_blocks = []
/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/internals.py in comp(s)
3245 if isnull(s):
3246 return isnull(values)
-> 3247 return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq)
3248
3249 def _cast_scalar(block, scalar):
/Users/XXX/anaconda/envs/py36/lib/python3.6/site-packages/pandas/core/internals.py in _maybe_compare(a, b, op)
4617 type_names[1] = 'ndarray(dtype=%s)' % b.dtype
4618
-> 4619 raise TypeError("Cannot compare types %r and %r" % tuple(type_names))
4620 return result
4621
TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'str'
In [ ]:
我实在不明白:为什么第一段代码会报错说字符串 'male' 无法转换为 float,而第二段代码又会报错说无法比较 ndarray(dtype=int64) 和 str 这两种类型。这个问题应该如何解决?完整代码如下:
# coding: utf-8
# Notebook-exported script: loads the Titanic training CSV, encodes Sex,
# imputes Age, plots survival histograms, derives FamilySize and builds the
# feature matrix `xs` and label vector `y`.
# Each "# In[n]:" marker corresponds to one notebook cell.

# In[1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
# run_line_magic() replaces the deprecated InteractiveShell.magic().
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:
df = pd.read_csv("Desktop/data/train.csv", delimiter=',')
print(df.head())
print(df.columns)

# In[3]:
mapping = {'male': 0, 'female': 1}
# Encode Sex only while the column still holds the raw labels. Without this
# guard, re-running the cell applies a string-keyed mapping to an
# already-numeric column. Values missing from `mapping` become NaN.
if not pd.api.types.is_numeric_dtype(df["Sex"]):
    df["Sex"] = df["Sex"].map(mapping)
print(df.Sex)
# The former `df.replace("male",0).replace("female",1)` line was removed:
# its result was never assigned, so it had no effect on df.

# In[4]:
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which is not guaranteed to update df itself.
df["Age"] = df["Age"].fillna(df["Age"].median())

# In[5]:
# One sub-frame per outcome: index 0 -> Survived == 0, index 1 -> Survived == 1.
split_data = [df[df.Survived == survived] for survived in [0, 1]]
temp = [group["Pclass"].dropna() for group in split_data]
plt.hist(temp, histtype="barstacked", bins=3)

# In[6]:
temp = [group["Age"].dropna() for group in split_data]
plt.hist(temp, histtype="barstacked", bins=16)

# In[7]:
# Family size = siblings/spouses + parents/children + the passenger.
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
df2 = df.drop(["Name", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"], axis=1)

# In[8]:
df2.head(10)

# In[9]:
train_data = df2.values
# NOTE(review): the positional slices below depend on the column order of the
# CSV (column 1 as the label, columns 2+ as features) -- confirm against
# df2.columns.
xs = train_data[:, 2:]  # feature variables: Pclass and everything after it
y = train_data[:, 1]  # ground-truth labels