请注意,我使用的是pandas 1.1.2和numpy 1.19.2。
我有一个Series:provider_frame['NEQ'],其中的数值数据里包含pd.NA。该Series的dtype为object。
在阅读pandas文档中有关Int64的部分时,我了解到应该使用pandas.NA来表示缺失值。在确认我的Series中只包含pd.NA或数字之后,我尝试了以下操作:
$ provider_frame['NEQ'] = provider_frame['NEQ'].astype('Int64')
output :
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in
----> 1 provider_frame['NEQ'] = provider_frame['NEQ'].astype('Int64')
/usr/local/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5541 else:
5542 # else, only a single dtype is given
-> 5543 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
5544 return self._constructor(new_data).__finalize__(self, method="astype")
5545
/usr/local/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
593 self, dtype, copy: bool = False, errors: str = "raise"
594 ) -> "BlockManager":
--> 595 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
596
597 def convert(
/usr/local/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs)
404 applied = b.apply(f, **kwargs)
405 else:
--> 406 applied = getattr(b, f)(**kwargs)
407 result_blocks = _extend_blocks(applied, result_blocks)
408
/usr/local/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
592 vals1d = values.ravel()
593 try:
--> 594 values = astype_nansafe(vals1d, dtype, copy=True)
595 except (ValueError, TypeError):
596 # e.g. astype_nansafe can fail on object-dtype of strings
/usr/local/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
912 # dispatch on extension dtype if needed
913 if is_extension_array_dtype(dtype):
--> 914 return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
915
916 if not isinstance(dtype, np.dtype):
/usr/local/lib/python3.8/site-packages/pandas/core/arrays/integer.py in _from_sequence(cls, scalars, dtype, copy)
367 @classmethod
368 def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "IntegerArray":
--> 369 return integer_array(scalars, dtype=dtype, copy=copy)
370
371 @classmethod
/usr/local/lib/python3.8/site-packages/pandas/core/arrays/integer.py in integer_array(values, dtype, copy)
158 TypeError if incompatible types
159 """
--> 160 values, mask = coerce_to_array(values, dtype=dtype, copy=copy)
161 return IntegerArray(values, mask)
162
/usr/local/lib/python3.8/site-packages/pandas/core/arrays/integer.py in coerce_to_array(values, dtype, mask, copy)
242 "mixed-integer-float",
243 ]:
--> 244 raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
245
246 elif is_bool_dtype(values) and is_integer_dtype(dtype):
TypeError: object cannot be converted to an IntegerDtype
从这个帖子中,我了解到由于pandas的一个问题,需要分两步转换:先转换为float,再转换为Int64。
在尝试先转换为float时,我得到了以下回溯信息:
$ provider_frame['NEQ'] = provider_frame['NEQ'].astype('float')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in
----> 1 provider_frame['NEQ'] = provider_frame['NEQ'].astype('float')
/usr/local/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5541 else:
5542 # else, only a single dtype is given
-> 5543 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
5544 return self._constructor(new_data).__finalize__(self, method="astype")
5545
/usr/local/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
593 self, dtype, copy: bool = False, errors: str = "raise"
594 ) -> "BlockManager":
--> 595 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
596
597 def convert(
/usr/local/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs)
404 applied = b.apply(f, **kwargs)
405 else:
--> 406 applied = getattr(b, f)(**kwargs)
407 result_blocks = _extend_blocks(applied, result_blocks)
408
/usr/local/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
592 vals1d = values.ravel()
593 try:
--> 594 values = astype_nansafe(vals1d, dtype, copy=True)
595 except (ValueError, TypeError):
596 # e.g. astype_nansafe can fail on object-dtype of strings
/usr/local/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
988 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
989 # Explicit copy, or required since NumPy can't view from / to object.
--> 990 return arr.astype(dtype, copy=True)
991
992 return arr.view(dtype)
TypeError: float() argument must be a string or a number, not 'NAType'
我明白了,float类型无法处理pd.NA类型的值。
现在,我用下面的代码把所有值为pd.NA的行替换为np.nan:
provider_frame.loc[provider_frame['NEQ'].isna() == True, 'NEQ']=np.NaN
然后,我将执行以下代码:
provider_frame['NEQ'] = provider_frame['NEQ'].astype('float')
provider_frame['NEQ'] = provider_frame['NEQ'].astype('Int64')
这样转换就能成功。如果我跳过转换为float这一步,直接执行
`provider_frame['NEQ'] = provider_frame['NEQ'].astype('Int64')`,输出为:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in
----> 1 provider_frame['NEQ'] = provider_frame['NEQ'].astype('Int64')
/usr/local/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5541 else:
5542 # else, only a single dtype is given
-> 5543 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
5544 return self._constructor(new_data).__finalize__(self, method="astype")
5545
/usr/local/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
593 self, dtype, copy: bool = False, errors: str = "raise"
594 ) -> "BlockManager":
--> 595 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
596
597 def convert(
/usr/local/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs)
404 applied = b.apply(f, **kwargs)
405 else:
--> 406 applied = getattr(b, f)(**kwargs)
407 result_blocks = _extend_blocks(applied, result_blocks)
408
/usr/local/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
592 vals1d = values.ravel()
593 try:
--> 594 values = astype_nansafe(vals1d, dtype, copy=True)
595 except (ValueError, TypeError):
596 # e.g. astype_nansafe can fail on object-dtype of strings
/usr/local/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
912 # dispatch on extension dtype if needed
913 if is_extension_array_dtype(dtype):
--> 914 return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
915
916 if not isinstance(dtype, np.dtype):
/usr/local/lib/python3.8/site-packages/pandas/core/arrays/integer.py in _from_sequence(cls, scalars, dtype, copy)
367 @classmethod
368 def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "IntegerArray":
--> 369 return integer_array(scalars, dtype=dtype, copy=copy)
370
371 @classmethod
/usr/local/lib/python3.8/site-packages/pandas/core/arrays/integer.py in integer_array(values, dtype, copy)
158 TypeError if incompatible types
159 """
--> 160 values, mask = coerce_to_array(values, dtype=dtype, copy=copy)
161 return IntegerArray(values, mask)
162
/usr/local/lib/python3.8/site-packages/pandas/core/arrays/integer.py in coerce_to_array(values, dtype, mask, copy)
242 "mixed-integer-float",
243 ]:
--> 244 raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
245
246 elif is_bool_dtype(values) and is_integer_dtype(dtype):
TypeError: object cannot be converted to an IntegerDtype
转换过程中显然有些地方我没有理解。将包含pd.NA的Series转换为Int64类型的最佳方法是什么?pd.NA的目的不正是为了提供一种转换为Int64类型的简单方法吗?
答案 0 :(得分:0)
这是将数字的字符串版本转换为Int64的一种方法:
import pandas as pd
df = pd.DataFrame({'x': ['10', '20', None, '40']}) # list of strings + None
df['x'] = pd.to_numeric(df['x'], downcast='float', errors='raise').astype('Int64')
print(df)
x
0 10
1 20
2 <NA>
3 40
您也许可以省略to_numeric()中的downcast和/或errors参数,但上面这个版本是可以正常工作的(pandas 1.1.0版)。