我有一个数据框,其数据类似于下面的示例数据。我正在尝试使用get_dummies为categories字段中的值创建虚拟变量,但是当我运行下面的代码时,我得到以下错误。我想要的结果是:例如对于第一条记录,有一列名为“Ramen”、值为1,另有一列名为“Japanese”、值也为1。
示例数据:
user_id business_id stars_x \
1 CxDOIDnH8gp9KXzpBHJYXw XSiqtcVEsP6dLOL7ZA9OxA 4
2 CxDOIDnH8gp9KXzpBHJYXw v95ot_TNwTk1iJ5n56dR0g 3
3 CxDOIDnH8gp9KXzpBHJYXw uloYxyRAMesZzI99mfNInA 2
4 CxDOIDnH8gp9KXzpBHJYXw gtcsOodbmk4E0TulYHnlHA 4
address attributes \
1 522 Yonge Street {u'BusinessParking': {u'garage': False, u'stre...
2 1661 Denison Street {u'BusinessParking': {u'garage': False, u'stre...
3 4101 Rutherford Road {u'BusinessParking': {u'garage': False, u'stre...
4 815 W Bloor Street {u'Alcohol': u'full_bar', u'HasTV': False, u'N...
categories city \
1 [Restaurants, Ramen, Japanese] Toronto
2 [Chinese, Seafood, Restaurants] Markham
3 [Italian, Restaurants] Woodbridge
4 [Food, Coffee & Tea, Sandwiches, Cafes, Cockta... Toronto
hours is_open latitude \
1 {u'Monday': u'11:00-22:00', u'Tuesday': u'11:0... 1 43.663689
2 {} 0 43.834295
3 {u'Monday': u'12:00-22:00', u'Tuesday': u'12:0... 1 43.823486
4 {u'Monday': u'12:00-2:00', u'Tuesday': u'12:00... 1 43.662726
longitude name neighborhood postal_code \
1 -79.384200 Kenzo Ramen Downtown Core M4Y 1X9
2 -79.305282 Vince Seafood Restaurant & BBQ Milliken L3R 6E4
3 -79.568345 Motorino Enoteca Pine Grove L4L 1A5
4 -79.422167 Northwood Bickford Park M6G 1M1
review_count stars_y state good_reviews
1 76 3.5 ON True
2 23 3.5 ON False
3 26 3.5 ON False
4 93 4.0 ON True
代码:
# Fails with "TypeError: unhashable type: 'list'": every cell of
# `categories` is a Python list, and get_dummies has to hash each cell
# value while factorizing the column.
pd.get_dummies(bus_rev['categories'])
错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-23-e57eccbfbe12> in <module>()
----> 1 bus_rev_cat2 = pd.get_dummies(bus_rev['categories'])
2 #bus_revlist = pd.concat([bus_rev,bus_rev_cat2],axis=1)
3 #bus_revlist.head()
/Users/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in get_dummies(data, prefix, prefix_sep, dummy_na, columns, sparse, drop_first)
1102 else:
1103 result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
-> 1104 sparse=sparse, drop_first=drop_first)
1105 return result
1106
/Users/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in _get_dummies_1d(data, prefix, prefix_sep, dummy_na, sparse, drop_first)
1109 sparse=False, drop_first=False):
1110 # Series avoids inconsistent NaN handling
-> 1111 codes, levels = _factorize_from_iterable(Series(data))
1112
1113 def get_empty_Frame(data, sparse):
/Users/anaconda/lib/python2.7/site-packages/pandas/core/categorical.pyc in _factorize_from_iterable(values)
2038 codes = values.codes
2039 else:
-> 2040 cat = Categorical(values, ordered=True)
2041 categories = cat.categories
2042 codes = cat.codes
/Users/anaconda/lib/python2.7/site-packages/pandas/core/categorical.pyc in __init__(self, values, categories, ordered, name, fastpath)
288 codes, categories = factorize(values, sort=True)
289 except TypeError:
--> 290 codes, categories = factorize(values, sort=False)
291 if ordered:
292 # raise, as we don't have a sortable data structure and so
/Users/anaconda/lib/python2.7/site-packages/pandas/core/algorithms.pyc in factorize(values, sort, order, na_sentinel, size_hint)
311 table = hash_klass(size_hint or len(vals))
312 uniques = vec_klass()
--> 313 labels = table.get_labels(vals, uniques, 0, na_sentinel, True)
314
315 labels = _ensure_platform_int(labels)
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_labels (pandas/hashtable.c:15447)()
TypeError: unhashable type: 'list'
答案 0 :(得分:6)
你可以试试这个
# Minimal frame reproducing the question's list-valued `categories` column.
df = pd.DataFrame({'categories': [['Restaurants', 'Ramen', 'Japanese'],
                                  ['Chinese', 'Seafood', 'Restaurants']]})
# Expand each list into its own columns, stack them into one category per
# row (the original row label stays on index level 0), one-hot encode, and
# then add the indicator rows back together per original row.
# `.sum(level=0)` was removed in pandas 2.0; `.groupby(level=0).sum()` is
# the equivalent spelling and works on both old and new pandas versions.
dummies = pd.get_dummies(df.categories.apply(pd.Series).stack()).groupby(level=0).sum()
dummies
Out[1095]:
Chinese Japanese Ramen Restaurants Seafood
0 0 1 1 1 0
1 1 0 0 1 1