>>> arr
array([[ 0., 10., 0., ..., 0., 0., 0.],
[ 0., 4., 0., ..., 6., 0., 9.],
[ 0., 0., 0., ..., 0., 0., 0.],
...,
[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 2., 0., 0.],
[ 0., 0., 0., ..., 0., 3., 0.]])
对于上面的numpy数组,我想把其中每个与数据框df_A的country_codes
列相匹配的值,替换为df_A同一行中continent_codes
列的值。df_A如下所示:
country_codes continent_codes
0 4 4
1 8 3
2 12 5
3 16 6
4 24 5
目前,我通过遍历数据框并使用numpy索引语法来完成替换。鉴于iterrows()往往很慢,是否有更直接/矢量化的方法来做到这一点?
# Walk the lookup table one row at a time and overwrite, in place, every
# element of arr equal to that row's country code with its continent code.
for _, record in self.df_A.iterrows():
    country, continent = record['country_codes'], record['continent_codes']
    hits = arr == country
    arr[hits] = continent
答案 0 :(得分:1)
方法#1:使用np.searchsorted
和np.in1d
的一种矢量化方法如下所示 -
# Store country_codes and continent_codes column data for further usage
oldval = np.array(df['country_codes'])
newval = np.array(df['continent_codes'])
# np.searchsorted only returns correct positions for a sorted array, so sort
# the lookup keys and reorder the replacement values to stay aligned with them
order = np.argsort(oldval, kind='stable')
oldval = oldval[order]
newval = newval[order]
# Mask of elements to be changed (same shape as arr); np.isin supersedes the
# deprecated np.in1d
mask = np.isin(arr, oldval)
# Indices for each match from oldval in arr
idx = np.searchsorted(oldval, arr[mask])
# Using the mask put selective elements from continent_codes column into arr.
# Assigning through arr[mask] writes into arr itself; arr.ravel()[mask] would
# write into a temporary copy whenever arr is not contiguous.
arr[mask] = newval[idx]
示例运行 -
>>> arr # Original 2D array
array([[23, 4, 23, 5, 8],
[ 3, 6, 8, 5, 11],
[16, 24, 15, 4, 10],
[ 4, 16, 10, 8, 1]])
>>> df
country_codes continent_codes
0 4 4
1 8 3
2 12 5
3 16 6
4 24 5
>>> oldval = np.array(df['country_codes'])
>>> newval = np.array(df['continent_codes'])
>>> mask = np.in1d(arr,oldval)
>>> idx = np.searchsorted(oldval,arr.ravel()[mask])
>>> arr.ravel()[mask] = newval[idx]
>>> mask.reshape(arr.shape) # Mask array depicting which elements were updated
array([[False, True, False, False, True],
[False, False, True, False, False],
[ True, True, False, True, False],
[ True, True, False, True, False]], dtype=bool)
>>> arr # Updated 2D array
array([[23, 4, 23, 5, 3],
[ 3, 6, 3, 5, 11],
[ 6, 5, 15, 4, 10],
[ 4, 6, 10, 3, 1]])
方法#2:作为变体,您还可以通过比较np.searchsorted(oldval,arr,'left')
和np.searchsorted(oldval,arr,'right')
的结果来创建掩码(两者不相等的位置即为匹配的元素),如this question
的解答中所述;之后在把值写入arr时,可以直接复用np.searchsorted(oldval,arr,'left')
的结果,
从而得到更高效的解决方案,如下所示 -
# Store country_codes and continent_codes column data for further usage
oldval = np.array(df['country_codes'])
newval = np.array(df['continent_codes'])
# np.searchsorted only returns correct positions for a sorted array, so sort
# the lookup keys and reorder the replacement values to stay aligned with them
order = np.argsort(oldval, kind='stable')
oldval = oldval[order]
newval = newval[order]
# Left and right indices for each match from oldval in arr
left_idx = np.searchsorted(oldval, arr, 'left')
right_idx = np.searchsorted(oldval, arr, 'right')
# Mask of elements to be changed: the two insertion points differ only where
# the element is actually present in oldval
mask = left_idx != right_idx
# Using the mask put selective elements from continent_codes column into arr
arr[mask] = newval[left_idx[mask]]
运行时测试并验证输出
功能定义 -
def original_app(arr, df):
    """Baseline: replace country codes in ``arr`` by looping over ``df`` rows.

    For each row of ``df``, in order, every element of ``arr`` equal to the
    row's ``country_codes`` value is overwritten with the row's
    ``continent_codes`` value. ``arr`` is modified in place; nothing is
    returned.
    """
    for _, record in df.iterrows():
        country = record['country_codes']
        continent = record['continent_codes']
        hits = arr == country
        arr[hits] = continent
def vectorized_app1(arr, df):
    """Replace country codes in ``arr`` with continent codes, vectorized.

    Uses a membership mask plus ``np.searchsorted`` to look up, for every
    element of ``arr`` that occurs in ``df['country_codes']``, the matching
    ``df['continent_codes']`` value. Each element is looked up once against
    its original value, so replacements are not chained.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to update. Modified in place.
    df : pandas.DataFrame
        Lookup table with ``country_codes`` and ``continent_codes`` columns.
        The rows may be in any order.

    Returns
    -------
    None
    """
    oldval = np.asarray(df['country_codes'])
    newval = np.asarray(df['continent_codes'])
    # np.searchsorted is only correct on a sorted array, so sort the keys and
    # keep the replacement values aligned with them.
    order = np.argsort(oldval, kind='stable')
    sorted_old = oldval[order]
    sorted_new = newval[order]
    # np.isin (the replacement for the deprecated np.in1d) keeps arr's shape.
    mask = np.isin(arr, sorted_old)
    idx = np.searchsorted(sorted_old, arr[mask])
    # Assign through arr[mask] rather than arr.ravel()[mask]: ravel() returns
    # a copy for non-contiguous arrays, which would silently drop the update.
    arr[mask] = sorted_new[idx]
def vectorized_app2(arr, df):
    """Replace country codes in ``arr`` with continent codes, vectorized.

    Runs ``np.searchsorted`` twice (left and right insertion points) against
    the country codes; an element of ``arr`` is present in the table exactly
    where the two insertion points differ. The left insertion point is then
    reused as the index of the replacement value. Each element is looked up
    once against its original value, so replacements are not chained.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to update. Modified in place.
    df : pandas.DataFrame
        Lookup table with ``country_codes`` and ``continent_codes`` columns.
        The rows may be in any order.

    Returns
    -------
    None
    """
    oldval = np.asarray(df['country_codes'])
    newval = np.asarray(df['continent_codes'])
    # np.searchsorted is only correct on a sorted array, so sort the keys and
    # keep the replacement values aligned with them.
    order = np.argsort(oldval, kind='stable')
    sorted_old = oldval[order]
    sorted_new = newval[order]
    left_idx = np.searchsorted(sorted_old, arr, 'left')
    right_idx = np.searchsorted(sorted_old, arr, 'right')
    # left != right only where the element actually occurs in sorted_old.
    mask = left_idx != right_idx
    arr[mask] = sorted_new[left_idx[mask]]
验证输出 -
In [195]: # Input array
...: arr = np.random.randint(0,100000,(1000,1000))
...:
...: # Setup input dataframe
...: N = 1000
...: oldvals = np.unique(np.random.randint(0,100000,N))
...: newvals = np.random.randint(0,9,(oldvals.size))
...: df=pd.DataFrame({'country_codes':oldvals,'continent_codes':newvals})
...: df = df.reindex_axis(sorted(df.columns)[::-1], axis=1)
...:
...: # Make copies for input array for funcs to update them
...: arrc1 = arr.copy()
...: arrc2 = arr.copy()
...: arrc3 = arr.copy()
...:
In [196]: # Verify outputs
...: original_app(arrc1,df)
...: vectorized_app1(arrc2,df)
...: vectorized_app2(arrc3,df)
...:
In [197]: np.allclose(arrc1,arrc2)
Out[197]: True
In [198]: np.allclose(arrc1,arrc3)
Out[198]: True
计时 -
In [199]: # Make copies for input array for funcs to update them
...: arrc1 = arr.copy()
...: arrc2 = arr.copy()
...: arrc3 = arr.copy()
...:
In [200]: %timeit original_app(arrc1,df)
1 loops, best of 3: 2.79 s per loop
In [201]: %timeit vectorized_app1(arrc2,df)
1 loops, best of 3: 360 ms per loop
In [202]: %timeit vectorized_app2(arrc3,df)
1 loops, best of 3: 213 ms per loop
答案 1 :(得分:1)
以此数据为例,最多有N个国家/地区
# Exclusive upper bound on the country codes that can appear in values
N=10**5
# 1000 x 1000 array of random codes drawn from [0, N)
values=np.random.randint(0,N,(1000,1000))
# Lookup table for the first N//2 codes, each given a random continent in [1, 5).
# randint must be qualified as np.random.randint; the bare name is undefined.
exemple={'country':np.arange(N//2),'continent':np.random.randint(1,5,N//2)}
df=pd.DataFrame.from_dict(exemple)
你可以这样做:
# Identity lookup table: initially v[k] == k for every k in [0, N)
v=np.arange(N)
# Overwrite the entries at the listed country positions with their continent value
v[df['country']]=df['continent']
# Map every element of values through the table and write the result back into
# values itself; assumes every entry of values is a valid index into v — TODO confirm
v.take(values,out=values)
可能不是最优的,但效率很高(20ms)。