如果我想告诉pandas
它应该考虑缺少负值,我会
df[df < 0] = None
但是,df[df < 0]
给了我一个错误:
Traceback (most recent call last):
File "/usr/lib/pycharm/helpers/pydev/pydevd.py", line 2411, in <module>
globals = debugger.run(setup['file'], None, None, is_module)
File "/usr/lib/pycharm/helpers/pydev/pydevd.py", line 1802, in run
launch(file, globals, locals) # execute the script
File "/usr/lib/pycharm/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "foo.py", line 47, in <module>
df[df < 0] = None
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/frame.py", line 2296, in __setitem__
self._setitem_frame(key, value)
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/frame.py", line 2334, in _setitem_frame
self.where(-key, value, inplace=True)
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/generic.py", line 4058, in where
transpose=self._AXIS_REVERSED)
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/internals.py", line 2846, in putmask
return self.apply('putmask', **kwargs)
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/internals.py", line 2820, in apply
copy=align_copy)
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/frame.py", line 2692, in reindex_axis
fill_value=fill_value)
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/generic.py", line 2049, in reindex_axis
{axis: [new_index, indexer]}, fill_value=fill_value, copy=copy)
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/generic.py", line 2073, in _reindex_with_indexers
copy=copy)
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/internals.py", line 3503, in reindex_indexer
self.axes[axis]._can_reindex(indexer)
File "/usr/local/anaconda2/envs/myenv3/lib/python3.5/site-packages/pandas/core/index.py", line 2086, in _can_reindex
raise ValueError("cannot reindex from a duplicate axis")
ValueError: cannot reindex from a duplicate axis
df.info()
:
>>> df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 90350 entries, 0 to 90349
Columns: 177 entries, V2 to V227
dtypes: int16(1), int32(1), int8(175)
memory usage: 16.3 MB
此外,索引是唯一的:
>>> df.index.has_duplicates
Out[9]: False
以下是我的df的一些输出:发生了什么?
>>> df.head()
Out[5]:
V2 V4 V5 V6 V7 V8 V9 V10 V11 V23 ... V218 V219 V220 V221 \
0 12 1 1 1 -2 1 1 2 1 8 ... 4 2 2 3
1 12 1 2 3 4 2 2 2 2 5 ... 3 4 3 4
2 12 1 3 2 4 2 1 2 2 4 ... 4 2 2 4
3 12 1 1 3 4 3 1 2 1 8 ... 3 1 2 3
4 12 1 1 1 2 1 1 1 3 8 ... 4 2 2 4
V222 V223 V224 V225 V226 V227
0 4 2 4 2 3 3
1 2 3 3 3 2 3
2 3 1 1 2 2 3
3 4 3 3 2 2 3
4 3 2 3 2 3 3
[5 rows x 177 columns]
>>> (df < 0).head()
Out[6]:
V2 V4 V5 V6 V7 V8 V9 V10 V11 V23 \
0 False False False False True False False False False False
1 False False False False False False False False False False
2 False False False False False False False False False False
3 False False False False False False False False False False
4 False False False False False False False False False False
... V218 V219 V220 V221 V222 V223 V224 V225 V226 V227
0 ... False False False False False False False False False False
1 ... False False False False False False False False False False
2 ... False False False False False False False False False False
3 ... False False False False False False False False False False
4 ... False False False False False False False False False False
[5 rows x 177 columns]
然后做
import pandas as pd
df = pd.read_stata('data/WV6_Stata_v_2016_01_01.dta', convert_categoricals=False)
columnIds = list(range(4, 12)) + list(range(23, 25)) + list(range(45, 55)) + \
list(range(55, 57)) + list(range(59, 67))
columnIds += list(range(67, 70)) + list(range(82, 95)) + [95] + list(range(96, 102)) + \
list(range(102, 125)) + list(range(127, 144))
columnIds += list(range(143, 157)) + list(range(157, 211)) + list(range(211, 215)) + \
list(range(217, 226)) + list(range(226, 228))
columns = ['V2'] + ['V'+str(id) for id in columnIds]
df = df[columns]
df.set_index('V2', inplace=True)
df[df < 0] = None