Question

如何去掉iterrows()？用numpy或pandas能否更快地完成同样的操作？

import pandas as pd
import numpy as np
# Sample frame: label columns 'A' and 'B' plus an integer column 'C' that
# starts out as all zeros.
df = pd.DataFrame({
    'A': 'foo bar foo bar foo bar foo foo'.split(),
    'B': 'one one two three two two one three'.split(),
    'C': np.zeros(8, dtype=int),
})
print(df)
#      A      B  C
# 0  foo    one  0
# 1  bar    one  0
# 2  foo    two  0
# 3  bar  three  0
# 4  foo    two  0
# 5  bar    two  0
# 6  foo    one  0
# 7  foo  three  0

# Integer quota keyed by the labels that appear in column 'A'.
selDict = {"foo": 2, "bar": 3}

下面的代码可以得到想要的结果：

# Walk the rows in order; while the row's label still has quota left in
# selDict, consume one unit of it and flag the row by setting 'C' to 1.
# NOTE: this decrements the counts stored in selDict itself.
for i, r in df.iterrows():
    label = r["A"]
    if selDict[label] > 0:
        selDict[label] -= 1
        # DataFrame.set_value was removed in pandas 1.0; .at is the
        # label-based scalar setter that replaces it.
        df.at[i, 'C'] = 1

print(df)
#      A      B  C
# 0  foo    one  1
# 1  bar    one  1
# 2  foo    two  1
# 3  bar  three  1
# 4  foo    two  0
# 5  bar    two  1
# 6  foo    one  0
# 7  foo  three  0

Answer 1

如果我理解正确，你可以使用cumcount：

# Number the rows inside each 'A' group with cumcount(), look up each row's
# quota from selDict, and store 1 in 'C' where the number is below the quota.
rank_in_group = df.groupby('A').cumcount()
quota_per_row = df['A'].map(selDict)
df['C'] = (rank_in_group < quota_per_row).astype('int')

df
Out: 
     A      B  C
0  foo    one  1
1  bar    one  1
2  foo    two  1
3  bar  three  1
4  foo    two  0
5  bar    two  1
6  foo    one  0
7  foo  three  0

Answer 2

这是一种方法 -

1）辅助函数：

def argsort_unique(idx):
    """Scatter ``0..n-1`` into the positions named by ``idx``.

    The result satisfies ``result[idx[k]] == k``. When ``idx`` is a
    permutation of ``0..n-1`` (for example the output of ``argsort``), this
    is the inverse permutation, i.e. the same values as ``idx.argsort()``
    without running a second sort.

    Parameters
    ----------
    idx : numpy.ndarray
        Integer index array; assumed to be a permutation of ``0..n-1``.
        Slots not named by ``idx`` are left uninitialised.

    Returns
    -------
    numpy.ndarray
        Integer array with ``idx.size`` elements.
    """
    # Original idea : http://stackoverflow.com/a/41242285/3293881 by @Andras
    size = idx.size
    inverse = np.empty(size, dtype=int)
    positions = np.arange(size)
    inverse[idx] = positions
    return inverse

def get_bin_arr(grplens, stop1_idx):
    """Build a 0/1 array that flags the leading slots of consecutive groups.

    The output is laid out group after group: group ``i`` occupies
    ``grplens[i]`` consecutive slots, of which the first
    ``min(max(stop1_idx[i], 0), grplens[i])`` are 1 and the rest are 0.

    Parameters
    ----------
    grplens : array-like of int
        Length of each group.
    stop1_idx : array-like of int
        Requested number of leading ones per group. Values larger than the
        group length are capped at the group length; negative values are
        treated as 0.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``sum(grplens)``; empty when there are no
        groups.
    """
    grplens = np.asarray(grplens, dtype=int)
    counts = np.minimum(np.maximum(np.asarray(stop1_idx, dtype=int), 0), grplens)

    total = int(grplens.sum())
    if total == 0:
        return np.zeros(0, dtype=int)

    starts = np.r_[0, grplens[:-1].cumsum()]
    stops = starts + counts

    # One spare slot at the end absorbs a stop that falls exactly at `total`,
    # so no index has to be filtered out before the update.
    shift_arr = np.zeros(total + 1, dtype=int)

    # ufunc.at accumulates repeated indices. A plain `shift_arr[stops] -= 1`
    # applies a single update per distinct index, which loses a decrement
    # when a fully flagged group is followed by a group whose count is 0
    # (both groups then share the same stop position).
    np.add.at(shift_arr, starts, 1)
    np.subtract.at(shift_arr, stops, 1)

    # Running sum turns the +1/-1 markers into runs of ones.
    return shift_arr[:-1].cumsum()

辅助函数的另一种实现：基于循环和切片赋值，可能更快：

def get_bin_arr(grplens, stop1_idx):
    """Build a 0/1 array that flags the leading slots of consecutive groups.

    Loop-and-slice variant: group ``i`` occupies ``grplens[i]`` consecutive
    slots of the output, of which the first
    ``min(max(stop1_idx[i], 0), grplens[i])`` are set to 1.

    Parameters
    ----------
    grplens : array-like of int
        Length of each group.
    stop1_idx : array-like of int
        Requested number of leading ones per group. Values larger than the
        group length are capped at the group length; negative values are
        treated as 0.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``sum(grplens)``; empty when there are no
        groups.
    """
    grplens = np.asarray(grplens, dtype=int)
    if grplens.size == 0:
        return np.zeros(0, dtype=int)

    # Clamp to [0, group length]. A negative count would otherwise become a
    # slice bound counted from the end of the array and flag the wrong slots.
    counts = np.minimum(np.maximum(np.asarray(stop1_idx, dtype=int), 0), grplens)

    ends = grplens.cumsum()
    starts = ends - grplens
    out = np.zeros(ends[-1], dtype=int)
    for start, count in zip(starts, counts):
        out[start:start + count] = 1
    return out

2）主函数：

def out_C(A, selDict):
    """Flag the first ``selDict[label]`` occurrences of every label in ``A``.

    Parameters
    ----------
    A : array-like
        1-D sequence of labels.
    selDict : dict
        Maps a label to the number of its occurrences that should be
        flagged. Labels of ``A`` missing from ``selDict`` get a quota of 0;
        keys of ``selDict`` that never occur in ``A`` are ignored.

    Returns
    -------
    numpy.ndarray
        Integer 0/1 array aligned with ``A``.
    """
    A = np.asarray(A)
    if A.size == 0:
        return np.zeros(0, dtype=int)

    unq, counts = np.unique(A, return_counts=True)

    # Look the quota up per unique label. This avoids wrapping dict views in
    # np.array (a 0-d object array on Python 3) and avoids searchsorted
    # misplacing keys that do not occur in A.
    lims = np.array([selDict.get(key, 0) for key in unq.tolist()], dtype=int)

    # Flags in sorted-by-label order, one run per unique label.
    bin_arr = get_bin_arr(counts, lims)

    # A stable sort keeps equal labels in their original order, so the
    # leading flags of each run land on the earliest occurrences.
    sidx2 = A.argsort(kind='mergesort')

    # Undo the sort to bring the flags back to the order of A.
    return bin_arr[argsort_unique(sidx2)]

示例运行 -

原始方法：

def org_app(df, selDict):
    """Row-by-row reference implementation based on ``iterrows``.

    Resets column ``'C'`` to 0, then walks the rows in order and sets
    ``'C'`` to 1 for a row while the quota of its ``'A'`` label is not used
    up yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a label column ``'A'``; modified in place.
    selDict : dict
        Maps each label to the number of rows to flag. It is copied, so the
        caller's dict is left unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with column ``'C'`` filled in.

    Raises
    ------
    KeyError
        If a label in ``'A'`` is not a key of ``selDict``.
    """
    df['C'] = 0
    remaining = selDict.copy()
    for i, r in df.iterrows():
        label = r["A"]
        if remaining[label] > 0:
            remaining[label] -= 1
            # DataFrame.set_value was removed in pandas 1.0; .at is the
            # label-based scalar setter that replaces it.
            df.at[i, 'C'] = 1
    return df

案例＃1：

>>> df = pd.DataFrame({'A': 'foo bar foo bar res foo bar res foo foo res'.split()})
>>> selDict = {"foo":2, "bar":3, "res":1}
>>> org_app(df, selDict)
      A  C
0   foo  1
1   bar  1
2   foo  1
3   bar  1
4   res  1
5   foo  0
6   bar  1
7   res  0
8   foo  0
9   foo  0
10  res  0
>>> out_C(df.A.values, selDict)
array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

案例＃2：

>>> selDict = {"foo":20, "bar":30, "res":10}
>>> org_app(df, selDict)
      A  C
0   foo  1
1   bar  1
2   foo  1
3   bar  1
4   res  1
5   foo  1
6   bar  1
7   res  1
8   foo  1
9   foo  1
10  res  1
>>> out_C(df.A.values, selDict)
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

Answer 3

scipy.stats.rankdata可以提供帮助。为了得出每个元素在其所属桶（分组）内的排名，我们取"ordinal"方法与"min"方法结果之差：

>>> from scipy.stats import rankdata as rd
>>> rd(df.A, 'ordinal') - rd(df.A, 'min')
array([0, 0, 1, 1, 2, 2, 3, 4])

然后我们只是与df.A.map(selDict)进行比较：

# Difference between the 'ordinal' and 'min' ranks of column A, compared
# against each row's quota from selDict; the boolean result is stored as int.
rank_in_group = rd(df.A, 'ordinal') - rd(df.A, 'min')
quota_per_row = df.A.map(selDict)
df.C = (rank_in_group < quota_per_row).astype(int)

这可能效率不高（调用rankdata两次），但在scipy中使用优化的例程应该可以弥补这一点。

如果你不能使用scipy，可以用连续两次argsort()来实现"ordinal"方法，并用我基于unique和bincount的解法来实现"min"方法：

>>> _, v = np.unique(df.A, return_inverse=True)
>>> df.A.argsort().argsort() - (np.cumsum(np.concatenate(([0], np.bincount(v)))))[v]
0    0
1    0
2    1
3    1
4    2
5    2
6    3
7    4
Name: A, dtype: int64

然后与上面的df.A.map(selDict)进行比较。

Pandas：为列赋值，赋值的数量上限由字典中的值决定

3 个答案: