我有一个 int32 整数矩阵(大小约为 50k × 50k),需要将其转换为 float32。我可以采用下面两种方式:
# Preparation for the example: a large square matrix of small int32 values.
# NOTE: with n = 50_000 this allocates n * n * 4 bytes, i.e. about 10 GB.
import numpy as np
n = 50_000
matrix = np.random.randint(0, 10, (n, n), dtype='int32')
# Way 1: astype converts every element numerically (int 1 -> float 1.0).
# Because the dtype changes from int32 to float32, a new array is allocated
# even with copy=False; copy=False only skips the copy when no conversion is
# required.
matrix = matrix.astype(np.float32, copy=False)
# Way 2 (alternative to Way 1): view returns a new array object over the same
# memory and reinterprets the raw bytes as float32. No data is copied and no
# values are converted, so on int32 data the resulting numbers differ from
# the original integers (the int32 value 1 reads back as roughly 1.4e-45).
matrix = matrix.view(np.float32)
我应该在什么情况下使用哪一种方式?与“真正的” NumPy 数组相比,之后使用视图时在速度上是否存在劣势?
import numpy as np
import timeit
def create_boxplot(duration_list, showfliers=False, output_path="output.png"):
    """Draw a horizontal box plot of timing samples and save it as an image.

    One box is drawn per entry of ``duration_list``; boxes are ordered by the
    median of their samples, fastest first.

    Args:
        duration_list: Mapping from a label (used as the y-axis tick) to a
            sequence of timing samples. The x-axis is labelled "Time in ms",
            so the samples should be given in milliseconds.
        showfliers: Whether outlier points are drawn; passed to
            ``seaborn.boxplot``.
        output_path: File the figure is written to. Defaults to
            ``"output.png"``.

    Raises:
        ValueError: If ``duration_list`` is empty.
    """
    # Fail early with a clear message instead of an opaque tuple-unpacking
    # error further down; this also avoids the heavy plotting imports.
    if not duration_list:
        raise ValueError("duration_list must contain at least one entry")

    import statistics

    import matplotlib.pyplot as plt
    import seaborn as sns

    plt.figure(num=None, figsize=(8, 4), dpi=300, facecolor="w", edgecolor="k")
    sns.set(style="whitegrid")

    # Order by median rather than comparing the raw sample lists, which would
    # effectively sort by whatever the first sample of each series happens to
    # be. Series without samples are placed last.
    ordered = sorted(
        duration_list.items(),
        key=lambda item: (
            statistics.median(item[1]) if len(item[1]) else float("inf")
        ),
    )
    sorted_keys, sorted_vals = zip(*ordered)

    flierprops = dict(markerfacecolor="0.75", markersize=1, linestyle="none")
    ax = sns.boxplot(
        data=sorted_vals,
        width=0.3,
        orient="h",
        flierprops=flierprops,
        showfliers=showfliers,
    )
    ax.set(xlabel="Time in ms", ylabel="")
    # Keep the tick positions chosen by the plot and replace the default
    # numeric tick labels with the series names.
    plt.yticks(plt.yticks()[0], sorted_keys)
    plt.tight_layout()
    plt.savefig(output_path)
# Demonstrate that view() only changes the dtype under which the buffer is read.
n = 5_000
matrix = np.random.randint(0, 2, (n, n), dtype='int32')
print(matrix.dtype)
# NOTE: view() reinterprets the int32 bytes as float32; it does not convert
# the values numerically the way astype() does.
matrix = matrix.view(np.float32)
print(matrix.dtype)

# Shared timeit configuration. The setup string runs once per repeat and
# builds a fresh int32 matrix for the timed statement to work on.
timeit_d = {}
timeit_d["repeat"] = 500
timeit_d["number"] = 3
timeit_d["setup"] = "import numpy as np; n=5_000; matrix = np.random.randint(0, 2, (n, n), dtype='int32')"

# Maps a series label to its list of timings in milliseconds per execution.
duration_list = {}

# (label, timed statement, progress message) for each variant.
# NOTE: the two statements are not equivalent: view() creates no new data,
# astype() allocates and fills a converted array.
benchmarks = [
    ("view", "matrix2 = matrix.view(np.float32)", "Done views"),
    ("astype", "matrix2 = matrix.astype(np.float32)", "Done astype"),
]
for label, statement, done_message in benchmarks:
    durations = timeit.repeat(
        statement,
        setup=timeit_d["setup"],
        repeat=timeit_d["repeat"],
        number=timeit_d["number"],
    )
    # timeit.repeat reports the total seconds for `number` executions per
    # repeat; convert to milliseconds per execution so the values match the
    # "Time in ms" axis label of the plot.
    duration_list[label] = [
        1000 * total / timeit_d["number"] for total in durations
    ]
    print(done_message)

# Visualize
create_boxplot(duration_list)
很明显,view 比 astype 快。不过需要注意:view 只是把原有的 int32 字节重新解释为 float32,并不进行数值转换(例如整数 1 会被读成约 1.4e-45),因此两种方式得到的结果并不等价。
$ valgrind --tool=massif python3 foobar.py
$ massif-visualizer massif.out.view
上述内存分析结果清楚地表明,使用 view 的方案占用的内存更少。