我有一个以元组为索引、包含若干 float 类型列的数据框。我想以一种高效的方式对不同(不一定连续)的行的值求平均。
我目前的方法是使用 itertuples 遍历每一行,选出要参与平均的行,计算平均值并将其存储在新的数据框中。
理想情况下,我想避免使用 itertuples,并找到一种矢量化的解决方案以获得更好的性能。存在这样的解决方案吗?
以下是itertuples方法的代码:
def parameters_list_between_range(parameters: Tuple[int, ...], param_range: Tuple[int, ...],
                                  min_values: Tuple[int, ...]=None,
                                  max_values: Tuple[int, ...]=None) -> List[Tuple[int, ...]]:
    """
    Build every parameter tuple that lies within +/- param_range (inclusive) of parameters.

    I.e. if parameters = (2, 3, 1) and param_range = (0, 2, 1) it returns a list with all parameters
    between (2, 1, 0) and (2, 5, 2), in the order produced by itertools.product.
    If min_values or max_values is provided, the results are restricted to parameters that do not
    surpass those limits. Both limits are inclusive: a parameter equal to its limit is kept.

    :param parameters: A tuple with the parameters around which to build the list.
    :param param_range: A tuple indicating for each entry the range.
    :param min_values: A tuple that enforces a minimum (inclusive) value for each parameter.
    :param max_values: A tuple that enforces a maximum (inclusive) value for each parameter.
    :return: A list of parameters tuples.
    :raises ValueError: If parameters and param_range do not have the same length.
    """
    if len(parameters) != len(param_range):
        raise ValueError("Length of parameters and param_range is not equal.")

    lower_bounds = tuple(center - spread for center, spread in zip(parameters, param_range))
    # range() excludes its stop value, so the inclusive upper end is shifted by one.
    upper_stops = tuple(center + spread + 1 for center, spread in zip(parameters, param_range))

    if min_values is not None:
        lower_bounds = tuple(max(bound, limit) for bound, limit in zip(lower_bounds, min_values))
    if max_values is not None:
        # max_values is an inclusive limit, so it needs the same +1 shift as the stops it is
        # compared with; without it the maximum value itself could never be returned.
        upper_stops = tuple(min(stop, limit + 1) for stop, limit in zip(upper_stops, max_values))

    axis_ranges = (range(low, stop) for low, stop in zip(lower_bounds, upper_stops))
    return list(product(*axis_ranges))
def average_neighbors(input_df: pd.DataFrame, param_range: Tuple[int, ...], min_values: Tuple[int, ...]=None,
                      max_values: Tuple[int, ...]=None) -> pd.DataFrame:
    """
    Averages the input_df with the neighbor values defined by the param_range.

    For every row, the candidate neighbor labels are obtained from parameters_list_between_range
    (called with the row's index label, param_range, min_values and max_values). The row is then
    replaced by the column-wise mean of the rows of input_df whose label is one of those candidates.
    Candidate labels that are not present in the index are ignored. All means are computed from the
    untouched input_df, never from rows that were already averaged.

    :param input_df: A DataFrame indexed by tuples with the measurements.
    :param param_range: A tuple indicating for each parameter the distance of the maximum neighbor to average with.
    :param min_values: A tuple that enforces a minimum value for each parameter.
    :param max_values: A tuple that enforces a maximum value for each parameter.
    :return: A DataFrame of the same shape as the input_df but where each value has been averaged with its neighbors.
    """
    averaged_df = input_df.copy()  # type: pd.DataFrame
    # TODO: iterating row by row is not efficient. Does a vectorized approach exist?
    for position, label in enumerate(input_df.index):
        neighbor_labels = parameters_list_between_range(label, param_range, min_values, max_values)
        # A membership mask (instead of .loc[list_of_labels]) tolerates candidate labels that are
        # missing from the index; label-list lookups raise KeyError for those in recent pandas.
        is_neighbor = input_df.index.isin(neighbor_labels)
        neighbor_means = input_df.loc[is_neighbor].mean(axis=0)
        # Write the whole row by position: .at only supports scalar access, and a tuple label
        # passed to .loc would be read as a list of labels.
        averaged_df.iloc[position] = neighbor_means.to_numpy()
    return averaged_df
编辑:添加一个示例 input_df 以及得到的 averaged_df。调用方式是:
average_neighbors(input_df, (2, 1), (5, 5), (9, 9))
input_df
1 2 3 4 5
0
(5, 5) -0.034785 0.105553 0.175304 -0.100131 -0.087695
(5, 6) -0.019643 0.007028 -0.117302 -0.188429 0.140423
(5, 7) 1.090219 0.149492 0.134205 0.298541 -0.766889
(5, 8) 0.233639 0.011140 -0.070625 0.296987 -0.555725
(5, 9) -0.160929 -0.054387 0.149795 -0.236799 -0.236427
(6, 5) -0.053750 -0.105638 0.306676 0.075424 -0.253134
(6, 6) 0.488996 -0.104845 0.037985 0.097563 0.816975
(6, 7) -0.273730 -0.006430 0.048764 0.337964 -0.558355
(6, 8) -0.279250 -0.039354 0.139512 0.312060 -0.440458
(6, 9) -0.215230 -0.077674 -0.059508 0.731031 0.146549
(7, 5) -0.447961 0.114293 0.032839 0.042610 -0.158302
(7, 6) -0.151704 -0.006943 0.053286 -0.276120 0.084298
(7, 7) 0.690647 -0.053874 0.171092 0.461872 0.164381
(7, 8) 0.431084 0.166979 0.224027 0.226597 0.116096
(7, 9) -0.548030 0.021681 0.504869 0.347923 -0.307202
(8, 5) 0.903684 0.059785 0.143359 0.423591 0.046842
(8, 6) 0.424536 -0.025986 0.089045 -0.086571 0.004112
(8, 7) -0.036251 0.172838 0.391125 -0.253686 0.049140
(8, 8) -0.251853 0.035594 0.088101 0.154551 -0.071685
(8, 9) -0.314121 -0.002571 -0.035833 0.134743 -0.115290
(9, 5) 0.083115 -0.063250 0.192984 -0.030540 0.265402
(9, 6) -0.300660 0.136081 0.203255 -0.115099 0.014861
(9, 7) -0.276533 0.031048 0.041306 0.193987 -0.504231
(9, 8) 0.336343 -0.053169 0.037187 0.081937 -0.042678
(9, 9) 0.041883 -0.178786 0.094013 -0.110578 -0.242337
averaged_df
1 2 3 4 5
0
(5, 5) -0.036474 0.001575 0.081465 -0.058180 0.090428
(5, 6) 0.143143 0.010960 0.093650 0.083255 -0.068700
(5, 7) 0.245584 0.013688 0.068994 0.174115 -0.111028
(5, 8) 0.107602 0.013064 0.138015 0.308464 -0.270892
(5, 9) -0.089786 0.004731 0.148012 0.279633 -0.212861
(6, 5) 0.138672 0.005406 0.090149 -0.001508 0.074190
(6, 6) 0.215021 0.025439 0.122198 0.069386 -0.043184
(6, 7) 0.195557 0.025470 0.099101 0.115111 -0.084807
(6, 8) 0.030516 0.026953 0.140460 0.234315 -0.214655
(6, 9) -0.138086 0.007676 0.117542 0.245887 -0.183018
(7, 5) 0.089183 0.011608 0.111743 -0.015770 0.087378
(7, 6) 0.139079 0.027277 0.126928 0.058732 -0.049478
(7, 7) 0.140389 0.027973 0.098064 0.102810 -0.103316
(7, 8) 0.031192 0.008168 0.123869 0.198475 -0.224341
(7, 9) -0.072647 -0.017055 0.107154 0.193845 -0.174916
(8, 5) 0.118282 0.000437 0.132429 0.016357 0.102632
(8, 6) 0.087532 0.012257 0.142643 0.072583 -0.002334
(8, 7) 0.066802 0.020995 0.127057 0.094588 -0.030629
(8, 8) -0.057920 0.001357 0.137055 0.218200 -0.150506
(8, 9) -0.099897 -0.015913 0.124046 0.234783 -0.119626
(9, 5) 0.085168 0.035664 0.119128 -0.007021 0.042869
(9, 6) 0.098764 0.040444 0.146477 0.040005 -0.003722
(9, 7) 0.096179 0.044730 0.144269 0.043052 -0.020634
(9, 8) 0.008130 0.015527 0.168432 0.137483 -0.105978
(9, 9) -0.050783 -0.001712 0.152061 0.139195 -0.110516