尝试#1
s["order_id"].apply(lambda x: int(x) if pd.notnull(x) else np.nan)
尝试#2
def to_int(x):
if(pd.notnull(x)):
return int(x)
尝试#3
s["order_id"] = s.loc[pd.notnull(s["order_id"]),"order_id].astype(int)
所有这些都返回一个序列,其中值仍然被格式化为浮点数。
我想知道我是否可以使用更新功能或利用重新索引。
利用索引解决方案的尝试:
null = np.nan
data = {"time":{"0":1528971021539,"1":1529289904697,"2":1529572773525,"3":1529892602301,"4":1530082881098,"5":1530069453264,"6":1528985491630,"7":1529236762719,"8":1529475504491,"9":1529814085541,"10":1529906568681,"11":1530160346468,"12":1529833559160,"13":1530051985183,"14":1530240956273,"15":1529794554495,"16":1529892989425,"17":1529386510176,"18":1529118607780,"19":1529404958912,"20":1529812956409,"21":1530012703548,"22":1527815420250,"23":1527826735070,"24":1527832343938,"25":1527853694229,"26":1527889066223,"27":1527986243670,"28":1528070794031,"29":1528149294729,"30":1528158483701,"31":1528172242288,"32":1528173686892,"33":1528174729282,"34":1528175624472,"35":1528184014365,"36":1528184994544,"37":1528199211274,"38":1528204822424,"39":1528236692102,"40":1528246124079,"41":1528251449061,"42":1528254158311,"43":1528324045380,"44":1528409837346,"45":1528429172972,"46":1528453372400,"47":1528525996756,"48":1528530493509,"49":1528539093472},"user_id":{"0":1754627236948496,"1":4702200191313171,"2":4778254911976758,"3":8293985621789157,"4":5156436454415407,"5":4445821205748907,"6":6872300957263521,"7":579402494860,"8":2010389994610194,"9":3378398685582335,"10":2923987501904097,"11":7254681572754712,"12":2280706641994510,"13":5853777483445659,"14":1790488830140089,"15":4649841298300342,"16":8296801793054868,"17":6074985077237804,"18":7512067556495704,"19":7449962479289671,"20":931159100938705,"21":4303206141550631,"22":4931136210605885,"23":910152652690726,"24":213367265258802,"25":59665205254502,"26":7375134691043656,"27":5112755499047871,"28":1511225869347102,"29":6553192205018264,"30":5758319280291333,"31":5654341500640968,"32":8149628703137465,"33":6808112291514009,"34":3363098540596606,"35":4205809380744263,"36":3662128280212665,"37":986809097179824,"38":3834989038766064,"39":3561701388137551,"40":3363098540596606,"41":7998995390673240,"42":188780187662080,"43":290955994841187,"44":7996996554339358,"45":2624074855751159,"46":8317830532715985,"47":4819555707307085,"48":6662202062763635,"49":1363740504674809},"order_id":{"0":1161.0,"1":1175.0,"2":1186.0,"3":1200.0,"4":1217.0,"5":1213.0,"6":1162.0,"7":1171.0,"8":1183.0,"9":1192.0,"10":1205.0,"11":1219.0,"12":1195.0,"13":1212.0,"14":1221.0,"15":1190.0,"16":1201.0,"17":1166.0,"18":1167.0,"19":1181.0,"20":1191.0,"21":1211.0,"22":null,"23":null,"24":null,"25":null,"26":null,"27":null,"28":null,"29":null,"30":null,"31":null,"32":null,"33":null,"34":null,"35":null,"36":null,"37":null,"38":null,"39":null,"40":null,"41":null,"42":null,"43":null,"44":null,"45":null,"46":null,"47":null,"48":null,"49":null}}
s = pd.DataFrame(data=data)
orders = {"order_id":{"0":1161,"1":1175,"2":1205,"3":1219,"4":1195,"5":1212,"6":1221,"7":1190,"8":1201,"9":1166,"10":1167,"11":1181,"12":1186,"13":1191,"14":1211,"15":1200,"16":1217,"17":1213,"18":1162,"19":1171,"20":1183,"21":1192},"order_total":{"0":"206.50","1":"369.00","2":"313.65","3":"158.74","4":"164.50","5":"156.83","6":"184.50","7":"137.50","8":"120.00","9":"85.00","10":"369.00","11":"156.83","12":"184.50","13":"191.25","14":"297.50","15":"180.00","16":"394.40","17":"75.00","18":"191.25","19":"386.33","20":"95.00","21":"200.00"}}
o = pd.DataFrame(data=orders)
orders = s.loc[pd.notnull(s["order_id"])]
orders["order_id"] = orders["order_id"].astype(int)
s["order_total"] = np.nan
s.update(orders.merge(o, on='order_id', how='left').set_index(o.index)["order_total"])
答案 0 :(得分:3)
可能会被黑客入侵,但不建议这样做,因为某些功能可能会失败并降低性能:
s = pd.DataFrame({'order_id':[np.nan,8,9,4,2,3]})
s["order_id"] = s["order_id"].astype(object)
print (s)
order_id
0 NaN
1 8
2 9
3 4
4 2
5 3
Docs:
在NumPy中根本没有内置高性能NA支持的情况下,主要的伤亡是能够以整数数组表示NA。
In [20]: s = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
In [21]: s
Out[21]:
a 1
b 2
c 3
d 4
e 5
dtype: int64
In [22]: s.dtype
Out[22]: dtype('int64')
In [23]: s2 = s.reindex(['a', 'b', 'c', 'f', 'u'])
In [24]: s2
Out[24]:
a 1.0
b 2.0
c 3.0
f NaN
u NaN
dtype: float64
In [25]: s2.dtype
Out[25]: dtype('float64')
进行这种权衡主要是出于内存和性能方面的考虑,并且也使得最终的Series继续为“数字”。一种可能性是改用 dtype = object 数组。
编辑:
如果NaN
和float
之间的连接merge
失败,则可以通过dropna
删除float
,然后转换为integer
:
orders = s = s.dropna(subset=['order_id'])
orders['order_id'] = orders['order_id'].astype(int)
#if want select only one column there was typos - ] and ) after s["order_id"]
orders = s.loc[pd.notnull(s["order_id"]),"order_id"].astype(int)
orders.merge(df, on="order_id", how="left")
EDIT1:
orders = o.set_index('order_id')["order_total"]
s["order_total"] = s["order_id"].map(orders)
print (s.head(20))
time user_id order_id order_total
0 1528971021539 1754627236948496 1161.0 206.50
1 1529289904697 4702200191313171 1175.0 369.00
10 1529906568681 2923987501904097 1205.0 313.65
11 1530160346468 7254681572754712 1219.0 158.74
12 1529833559160 2280706641994510 1195.0 164.50
13 1530051985183 5853777483445659 1212.0 156.83
14 1530240956273 1790488830140089 1221.0 184.50
15 1529794554495 4649841298300342 1190.0 137.50
16 1529892989425 8296801793054868 1201.0 120.00
17 1529386510176 6074985077237804 1166.0 85.00
18 1529118607780 7512067556495704 1167.0 369.00
19 1529404958912 7449962479289671 1181.0 156.83
2 1529572773525 4778254911976758 1186.0 184.50
20 1529812956409 931159100938705 1191.0 191.25
21 1530012703548 4303206141550631 1211.0 297.50
22 1527815420250 4931136210605885 NaN NaN
23 1527826735070 910152652690726 NaN NaN
24 1527832343938 213367265258802 NaN NaN
25 1527853694229 59665205254502 NaN NaN
26 1527889066223 7375134691043656 NaN NaN