如何去掉 iterrows()?用 numpy 或 pandas 能否更快地完成?
import pandas as pd
import numpy as np
# Example frame: labels in A, a second label column B, and an all-zero
# integer flag column C.
df = pd.DataFrame(
    {
        'A': 'foo bar foo bar foo bar foo foo'.split(),
        'B': 'one one two three two two one three'.split(),
        'C': np.zeros(8, dtype=int),
    }
)
print(df)
# A B C
# 0 foo one 0
# 1 bar one 0
# 2 foo two 0
# 3 bar three 0
# 4 foo two 0
# 5 bar two 0
# 6 foo one 0
# 7 foo three 0
# Per-label quota: the loop below flags at most this many rows for each value of A.
selDict = {"foo":2, "bar":3}
这有效:
# Walk the rows in order and set C to 1 for the first selDict[label] rows of
# each label in column A.
# NOTE: selDict is decremented in place, so its counts are consumed by this loop.
for i, r in df.iterrows():
    label = r["A"]
    if selDict[label] > 0:
        selDict[label] -= 1
        # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter.
        df.at[i, 'C'] = 1
print(df)
# A B C
# 0 foo one 1
# 1 bar one 1
# 2 foo two 1
# 3 bar three 1
# 4 foo two 0
# 5 bar two 1
# 6 foo one 0
# 7 foo three 0
答案 0 :(得分:4)
如果我理解正确,你可以使用cumcount:
# 0-based position of every row inside its group of equal A values.
position_in_group = df.groupby('A').cumcount()
# Quota that applies to each row, looked up from its A label.
row_quota = df['A'].map(selDict)
# A row is flagged when its position is still below the quota.
df['C'] = (position_in_group < row_quota).astype('int')
df
Out:
A B C
0 foo one 1
1 bar one 1
2 foo two 1
3 bar three 1
4 foo two 0
5 bar two 1
6 foo one 0
7 foo three 0
答案 1 :(得分:2)
这是一种方法 -
1)辅助函数:
def argsort_unique(idx):
    """Return the inverse of a permutation.

    Equivalent to ``np.argsort(idx)`` when ``idx`` holds each of
    ``0 .. n-1`` exactly once, but done with a single scatter instead of
    a sort.

    Parameters
    ----------
    idx : sequence of int
        A permutation of ``range(n)``.

    Returns
    -------
    numpy.ndarray
        Integer array ``out`` of length ``n`` with ``out[idx[i]] == i``.
    """
    # Original idea : http://stackoverflow.com/a/41242285/3293881 by @Andras
    idx = np.asarray(idx, dtype=np.intp)
    n = idx.size
    sidx = np.empty(n, dtype=int)
    # Scatter the positions: the element that sits at position i goes to slot idx[i].
    sidx[idx] = np.arange(n)
    return sidx
def get_bin_arr(grplens, stop1_idx):
    """Build a 0/1 array marking the leading elements of consecutive groups.

    The output is the concatenation, over all groups, of
    ``min(stop1_idx[g], grplens[g])`` ones followed by zeros up to
    ``grplens[g]`` elements.

    Parameters
    ----------
    grplens : array-like of int
        Length of each group, in output order.
    stop1_idx : array-like of int
        Number of leading elements to mark in each group. Values larger
        than the group length are capped; negative values mark nothing.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``grplens.sum()`` holding 0s and 1s.
    """
    grplens = np.asarray(grplens)
    counts = np.clip(stop1_idx, 0, grplens)
    starts = grplens.cumsum() - grplens
    stops = starts + counts
    total = int(grplens.sum())

    # One spare slot at the end absorbs a stop that falls just past the last
    # element, so no stop has to be filtered out.
    shift_arr = np.zeros(total + 1, dtype=int)
    # ufunc.at accumulates repeated indices. Plain ``arr[ix] -= 1`` applies a
    # repeated index only once, which breaks when a fully marked group is
    # followed by a group with nothing marked (both stops coincide).
    np.add.at(shift_arr, starts, 1)
    np.add.at(shift_arr, stops, -1)
    return shift_arr[:-1].cumsum()
辅助函数的另一种实现,基于循环切片,可能更快:
def get_bin_arr(grplens, stop1_idx):
    """Build a 0/1 array marking the leading elements of consecutive groups.

    Loop-and-slice variant: for every group, the first
    ``min(stop1_idx[g], grplens[g])`` positions of its span are set to 1.

    Parameters
    ----------
    grplens : array-like of int
        Length of each group, in output order.
    stop1_idx : array-like of int
        Number of leading elements to mark in each group. Values larger
        than the group length are capped; negative values mark nothing.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``grplens.sum()`` holding 0s and 1s.
    """
    grplens = np.asarray(grplens)
    # Clamp to [0, group length]; a negative count would otherwise turn into
    # a from-the-end slice and mark almost everything.
    counts = np.clip(stop1_idx, 0, grplens)
    starts = grplens.cumsum() - grplens
    out = np.zeros(int(grplens.sum()), dtype=int)
    # Treating the first group like every other one avoids indexing
    # element 0 / -1 of an empty input.
    for start, count in zip(starts, counts):
        out[start:start + count] = 1
    return out
2)主函数:
def out_C(A, selDict):
    """Flag the first ``selDict[label]`` occurrences of every label in ``A``.

    Parameters
    ----------
    A : array-like
        1-D array of labels.
    selDict : dict
        Maps a label to the number of its leading occurrences to flag.
        Labels of ``A`` that are missing from ``selDict`` get a quota of 0;
        keys that do not occur in ``A`` are ignored.

    Returns
    -------
    numpy.ndarray
        Integer array aligned with ``A``: 1 for flagged rows, 0 otherwise.
    """
    A = np.asarray(A)
    unq, counts = np.unique(A, return_counts=True)
    # Look the quota up per unique label. The earlier searchsorted placement
    # assumed every key of selDict occurs in A and, on Python 3, was fed a
    # dict view instead of an array.
    lims = np.array([selDict.get(label, 0) for label in unq.tolist()], dtype=int)
    # 0/1 flags laid out in label-sorted order (groups are contiguous there).
    bin_arr = get_bin_arr(counts, lims)
    # A stable sort keeps rows of the same label in their original order, so
    # "leading" in sorted order means "first occurrences" in A.
    order = A.argsort(kind="mergesort")
    # Undo the sort to bring the flags back to the row order of A.
    return bin_arr[argsort_unique(order)]
运行示例 -
原始方法:
def org_app(df, selDict):
    """Reference row-by-row implementation.

    Writes column ``C`` of ``df`` in place: 1 for the first
    ``selDict[label]`` rows of each label in column ``A`` (in row order),
    0 for all other rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a label column ``A``. Column ``C`` is created or overwritten.
    selDict : dict
        Maps each label to the number of rows to flag. Not modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with column ``C`` set.

    Raises
    ------
    KeyError
        If a label in ``A`` is missing from ``selDict``.
    """
    remaining = selDict.copy()  # work on a copy so the caller's quotas survive
    flags = []
    for label in df['A']:
        if remaining[label] > 0:
            remaining[label] -= 1
            flags.append(1)
        else:
            flags.append(0)
    # One positional column assignment replaces the per-cell
    # DataFrame.set_value calls (set_value was removed in pandas 1.0).
    df['C'] = flags
    return df
案例#1:
>>> df = pd.DataFrame({'A': 'foo bar foo bar res foo bar res foo foo res'.split()})
>>> selDict = {"foo":2, "bar":3, "res":1}
>>> org_app(df, selDict)
A C
0 foo 1
1 bar 1
2 foo 1
3 bar 1
4 res 1
5 foo 0
6 bar 1
7 res 0
8 foo 0
9 foo 0
10 res 0
>>> out_C(df.A.values, selDict)
array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
案例#2:
>>> selDict = {"foo":20, "bar":30, "res":10}
>>> org_app(df, selDict)
A C
0 foo 1
1 bar 1
2 foo 1
3 bar 1
4 res 1
5 foo 1
6 bar 1
7 res 1
8 foo 1
9 foo 1
10 res 1
>>> out_C(df.A.values, selDict)
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
答案 2 :(得分:1)
scipy.stats.rankdata 可以提供帮助。为了得到每个元素在其所属分组(桶)内的排名,我们取 "ordinal" 方法与 "min" 方法所得结果之差:
>>> from scipy.stats import rankdata as rd
>>> rd(df.A, 'ordinal') - rd(df.A, 'min')
array([0, 0, 1, 1, 2, 2, 3, 4])
然后只需将其与 df.A.map(selDict) 进行比较:
# Ordinal rank minus min rank is each row's 0-based position within its group
# of equal A values; a row is flagged while that position is below its quota.
df.C = (rd(df.A, 'ordinal') - rd(df.A, 'min') < df.A.map(selDict)).astype(int)
这可能效率不高(调用rankdata两次),但在scipy中使用优化的例程应该可以弥补这一点。
如果不能使用 scipy,可以用两次 argsort() 来实现 "ordinal" 方法,并用 unique 和 bincount 来实现 "min" 方法:
>>> _, v = np.unique(df.A, return_inverse=True)
>>> df.A.argsort().argsort() - (np.cumsum(np.concatenate(([0], np.bincount(v)))))[v]
0 0
1 0
2 1
3 1
4 2
5 2
6 3
7 4
Name: A, dtype: int64
然后像上面一样,与 df.A.map(selDict) 进行比较。