我的示例表明,对于由 float 组成的 DataFrame,在某些情况下使用 query 可能比使用掩码(mask)更快。从图表来看,当条件由 1 到 5 个子条件组成时,query 的表现更好。
编辑(感谢 a_guest):当条件由 1 到 5 个子条件组成时,表现更好的是掩码(mask)方法。
那么,既然随着子条件数量的增加,两种方法的耗时呈现出相同的趋势,它们之间是否还存在差异?
用于绘制数据的函数:
from timeit import timeit

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def graph(data):
    """Plot the mask and query timings against the number of conditions.

    ``data`` is indexed with the keys ``"mask"`` and ``"query"``; both series
    are drawn over x values running from 1 to ``len(data["mask"])``.
    """
    nb_points = len(data["mask"])
    x_values = list(range(1, nb_points + 1))

    plt.xlabel('Number of conditions')
    plt.ylabel('timeit (ms)')
    plt.title('Benchmark mask vs query')
    plt.grid(True)
    # Red line for the mask series, blue line for the query series.
    for key, colour in (("mask", 'r'), ("query", 'b')):
        plt.plot(x_values, data[key], colour, label=key)
    plt.xlim(1, nb_points)
    plt.legend()
    plt.show()
用于生成供 timeit 测试的条件表达式的函数:
def create_multiple_conditions_mask(columns, nb_conditions, condition):
    """Build the source text of a boolean-mask expression over ``df``.

    Args:
        columns: Indexable collection of column labels.
        nb_conditions: How many leading entries of ``columns`` to use.
        condition: Comparison appended to each column reference, e.g. ``"> 0"``.

    Returns:
        A string such as ``"(df['A'] > 0) & (df['B'] > 0)"``; the empty string
        when ``nb_conditions`` is 0.

    Raises:
        IndexError: If ``nb_conditions`` exceeds the number of columns.
    """
    clauses = []
    for i in range(nb_conditions):
        # repr() emits a correctly quoted and escaped literal, so labels that
        # contain quotes or backslashes still give valid Python source.
        # str() first reduces str subclasses to a plain str.
        key = repr(str(columns[i]))
        clauses.append(f"(df[{key}] {condition})")
    return " & ".join(clauses)
def create_multiple_conditions_query(columns, nb_conditions, condition):
    """Build a quoted ``DataFrame.query`` expression as Python source text.

    Args:
        columns: Indexable collection of column names.
        nb_conditions: How many leading entries of ``columns`` to use.
        condition: Comparison appended to each column name, e.g. ``"> 0"``.

    Returns:
        The source of a string literal, such as ``"'A > 0 and B > 0'"``, ready
        to be placed between the parentheses of ``df.query(...)``.

    Raises:
        IndexError: If ``nb_conditions`` exceeds the number of columns.
    """
    expression = " and ".join(
        f"{columns[i]} {condition}" for i in range(nb_conditions)
    )
    # repr() picks the quoting and escaping needed for a valid literal, so a
    # condition that itself contains quotes (e.g. "== 'x'") no longer breaks
    # the generated statement.
    return repr(expression)
使用包含 float 的 pandas DataFrame 对掩码(mask)与 query 进行基准测试的函数:
def benchmarks_mask_vs_query(dim_df=(50, 10), labels=None, condition="> 0", random=False):
    """Time boolean-mask filtering against ``DataFrame.query`` on random floats.

    A DataFrame of shape ``dim_df`` is filled with ``np.random.randn`` values.
    For k = 1 .. nb_columns, a filter over the first k columns is generated in
    both styles, printed, and timed with ``timeit``.

    Args:
        dim_df: ``(nb_samples, nb_columns)`` shape of the DataFrame.
        labels: Column names. Only consulted when ``nb_columns`` exceeds 26,
            in which case exactly ``nb_columns`` names are required; otherwise
            the columns are named with the leading letters of the alphabet.
        condition: Comparison appended to every column reference.
        random: When true, ``condition`` is replaced on every iteration by
            ``"<x"`` with ``x`` drawn from ``np.random.random``.

    Returns:
        ``{"mask": [...], "query": [...]}`` where entry ``k - 1`` of each list
        is the mean duration, in milliseconds, of one evaluation of the filter
        with ``k`` sub-conditions.

    Raises:
        ValueError: If more than 26 columns are requested and ``labels`` does
            not hold exactly ``nb_columns`` names.
    """
    n_runs = 100  # evaluations per timeit measurement
    time_results = {"mask": [], "query": []}
    nb_samples, nb_columns = dim_df

    all_labels = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    if nb_columns > len(all_labels):
        # The alphabet is too short; the caller has to name every column.
        if labels is None or len(labels) != nb_columns:
            raise ValueError("labels length must match nb_columns")
        all_labels = labels

    df = pd.DataFrame(np.random.randn(nb_samples, nb_columns), columns=all_labels[:nb_columns])
    # The generated statements refer to ``df`` only, so that is the single
    # name exposed to timeit.
    namespace = {"df": df}

    for nb_conditions in range(1, nb_columns + 1):
        if random:
            condition = "<" + str(np.random.random(1)[0])
        mask = "df[" + create_multiple_conditions_mask(df.columns, nb_conditions, condition) + "]"
        query = "df.query(" + create_multiple_conditions_query(df.columns, nb_conditions, condition) + ")"
        print(f"Parameters: nb_conditions={nb_conditions}, condition= {condition}")
        print("Mask created: " + mask)
        print("Query created: " + query)
        print()
        for style, statement in (("mask", mask), ("query", query)):
            total_seconds = timeit(statement, number=n_runs, globals=namespace)
            # timeit returns the total for all runs, in seconds; store the
            # per-run mean in milliseconds.
            time_results[style].append(total_seconds / n_runs * 1000)
    return time_results
我运行的代码:
# Run the benchmark on a DataFrame of shape (50, 25) filled with random
# values, using a freshly drawn "<random_value" condition at every step,
# then plot the collected timings.
data = benchmarks_mask_vs_query(dim_df=(50, 25), random=True)
graph(data)
我得到的结果: