我有这个代码:
import numpy as np
from skimage.util import img_as_ubyte
from skimage.feature import canny
import math
# Convert the input image to 8-bit unsigned; sf_img is not defined in this snippet.
image = img_as_ubyte(sf_img)
# Canny edge map, flipped top-to-bottom by np.flipud.
edges = np.flipud(canny(image, sigma=3, low_threshold=10, high_threshold=25))
# Row and column indices of the non-zero (edge) entries of the edge map.
non_zeros = np.nonzero(edges)
true_rows = non_zeros[0]
true_col = non_zeros[1]
# NOTE(review): plt is never imported in this snippet - presumably
# matplotlib.pyplot; confirm and add the import.
plt.imshow(edges)
plt.show()
# Image side length used to centre and scale the pixel indices.
N_im = 256
# Offsets added to the grid coordinates along x and y.
x0 = 0
y0 = -0.25
# Number of grid cells along x (index l) and y (index m).
Npx = 129
Npy = 60
# Grid spacing along y and x.
delta_py = 0.025
delta_px = 0.031
# Number of candidate radii and the spacing between them.
Nr = 9
delta_r = 0.5
# A pixel votes for a radius only when |d - r| < rho.
rho = 0.063
# Fraction of the accumulator maximum used as the keep threshold.
epsilon = 0.75
# Candidate radii r_min + k * delta_r, stored as a (1, Nr) row vector.
r_k = np.zeros((1, Nr))
r_min = 0.5
for k in range(0, Nr):
    r_k[0, k] = k * delta_r + r_min
# Vote accumulator indexed as a[m, l, k].
a = np.zeros((Npy, Npx, Nr))
# FOR LOOP TO BE TIME OPTIMIZED
# For every edge pixel i and every grid cell (m, l): compute the distance d
# between the centred/scaled pixel position and the cell, pick the nearest
# radius in r_k, and add one vote when d is within rho of that radius.
for i in range(0, np.size(true_col, 0)):  # true_col and true_rows have the same size, so either works
    for m in range(0, Npy):
        for l in range(0, Npx):
            d = math.sqrt(math.pow(
                (((true_col[i] - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
                    l * delta_px - (Npx * delta_px / 2) + x0)),
                2) + math.pow(
                (((true_rows[i] - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
                    m * delta_py - (Npy * delta_py / 2) + y0)),
                2))
            # r_k has shape (1, Nr), so the flat argmin is the radius index.
            min_idx = np.argmin(np.abs(d - r_k))
            rk_hat = r_k[0, min_idx]
            if np.abs(d - rk_hat) < rho:
                a[m, l, min_idx] = a[m, l, min_idx] + 1
# ANOTHER LOOP TO BE OPTIMIZED
# for m in range(0, Npy):
#     for l in range(0, Npx): #ORIGINAL
#         for k in range(0, Nr):
#             if a[m, l, k] < epsilon * np.max(a):
#                 a[m, l, k] = 0
# Zero every accumulator entry below epsilon times the global maximum.
a[np.where(a[:, :, :] < epsilon * np.max(a))] = 0 #SUBSTITUTED
# Sum over the radius axis: surviving votes per grid cell.
a_prime = np.sum(a, axis=2)
# Grid coordinates along x (acc_x) and y (acc_y), stored as column vectors.
acc_x = np.zeros((Npx, 1))
acc_y = np.zeros((Npy, 1))
for l in range(0, Npx):
    acc_x[l, 0] = l * delta_px - (Npx * delta_px / 2) + x0
for m in range(0, Npy):
    acc_y[m, 0] = m * delta_py - (Npy * delta_py / 2) + y0
# Vote-weighted sum of the (x, y) grid coordinates; prod starts as the int 0
# and becomes a 2-element array after the first addition.
prod = 0
for m in range(0, Npy):
    for l in range(0, Npx):
        prod = prod + (np.array([acc_x[l, 0], acc_y[m, 0]]) * a_prime[m, l])
# Vote-weighted mean position.
# NOTE(review): divides by zero (yielding nan) if a_prime sums to 0 - confirm
# whether an empty accumulator can occur.
points = prod / np.sum(a_prime)
根据对某个回答的评论,补充一组可用于测试的随机输入:
# Ten random integer coordinates in [0, 256) standing in for the edge indices;
# presumably so the loop can be run without the real image - confirm.
true_rows = np.random.randint(0,256,10)
true_col = np.random.randint(0,256,10)
简而言之,这段代码扫描一幅事先经过 Canny 边缘检测处理的 256x256 图像。因此外层 for 循环必须遍历结果图像的每个像素,并且对每个像素还要执行 2 层嵌套的 for 循环,这些循环根据 'a' 矩阵的 l 和 m 索引的值执行一些操作。
由于边缘检测返回的是只含 0 和 1(1 对应边缘)的图像,而内部操作只需对值为 1 的点执行,所以我使用了
non_zeros = np.nonzero(edges)
只获取我感兴趣的索引。确实,以前的代码是这样的
# Earlier version: scan every pixel of the N_im x N_im edge map and run the
# grid loops only where the edge map equals 1.
# NOTE(review): here the row index i is combined with the x grid (l, delta_px)
# and the column index j with the y grid, whereas the np.nonzero-based version
# pairs true_col with l and true_rows with m - confirm which pairing is intended.
for i in range(0, N_im):
    for j in range(0, N_im):
        if edges[i, j] == 1:
            for m in range(0, Npy):
                for l in range(0, Npx):
                    d = math.sqrt(math.pow(
                        (((i - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
                            l * delta_px - (Npx * delta_px / 2) + x0)),
                        2) + math.pow(
                        (((j - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
                            m * delta_py - (Npy * delta_py / 2) + y0)),
                        2))
                    # r_k has shape (1, Nr), so the flat argmin is the radius index.
                    min_idx = np.argmin(np.abs(d - r_k))
                    rk_hat = r_k[0, min_idx]
                    if np.abs(d - rk_hat) < rho:
                        a[m, l, min_idx] = a[m, l, min_idx] + 1
似乎我设法优化了前两个循环,但我的脚本需要比这更快。 运行大约需要 6~7 分钟,我需要执行它大约 1000 次。你能帮我进一步优化这个脚本的循环吗?谢谢!
答案 0 :(得分:2)
您可以使用 Numba JIT 来加速计算(因为默认的 CPython 解释器执行这类计算非常慢)。此外,您可以重新编写循环,以便代码可以并行运行。
这是结果代码:
import numba as nb
# Assume you work with 64-bits integer,
# feel free to change it to 32-bit integers if this is not the case.
# If you encounter type issue, let Numba choose with: @nb.njit(parallel=True)
# However, note that the first run will be slower if you let Numba choose.
@nb.njit('int64[:,:,::1](bool_[:,:], float64[:,:], int64, int64, int64, int64, float64, float64, float64, float64, float64)', parallel=True)
def fasterImpl(edges, r_k, Npy, Npx, Nr, N_im, delta_px, delta_py, rho, x0, y0):
    """Count, per grid cell and radius, the edge pixels lying near that radius.

    Args:
        edges: 2-D boolean edge map.
        r_k: candidate radii, float64 array indexed as r_k[0, k].
        Npy, Npx: number of grid cells along m and l.
        Nr: number of radii (size of the last accumulator axis).
        N_im: image size used to centre and scale the pixel indices.
        delta_px, delta_py: grid spacing along l and m.
        rho: a pixel votes only when |d - nearest radius| < rho.
        x0, y0: offsets added to the grid coordinates.

    Returns:
        int64 array of shape (Npy, Npx, Nr) holding the vote counts.
    """
    a = np.zeros((Npy, Npx, Nr), dtype=nb.int64)
    # Find all the position where edges[i,j]==1
    validEdgePos = np.where(edges == 1)
    # Parallel over m: each iteration only writes to its own a[m, ...] slice.
    for m in nb.prange(0, Npy):
        for l in range(0, Npx):
            # Iterate over the i,j value where edges[i,j]==1
            # NOTE(review): i (first index from np.where) is paired with the
            # l/delta_px grid and j with the m/delta_py grid - confirm this
            # matches the intended row/column convention.
            for i, j in zip(validEdgePos[0], validEdgePos[1]):
                d = math.sqrt(math.pow(
                    (((i - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
                        l * delta_px - (Npx * delta_px / 2) + x0)),
                    2) + math.pow(
                    (((j - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
                        m * delta_py - (Npy * delta_py / 2) + y0)),
                    2))
                min_idx = np.argmin(np.abs(d - r_k))
                rk_hat = r_k[0, min_idx]
                if np.abs(d - rk_hat) < rho:
                    a[m, l, min_idx] += 1
    return a
在我的机器上,使用您的问题中描述的输入(包括提供的 sf_img
),此代码快 616 倍。
Reference time: 109.680 s
Optimized time: 0.178 s
请注意,结果与参考实现完全相同。
答案 1 :(得分:0)
从您的脚本来看,您对 numpy 的使用经验似乎不多。Numpy 内部使用 SIMD 指令做了优化,而您的逐元素循环写法恰恰使这种优化无从发挥。我建议您先了解一下编写 numpy 代码的基础知识。
请查看此备忘单。 https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf
例如,此代码可以更改为
# Original form: fill the (1, Nr) radius table one element at a time.
r_k = np.zeros((1, Nr))
for k in range(0, Nr):
    r_k[0, k] = k * delta_r + r_min
### to a simple np.arange assignment
r_k = np.zeros((1, Nr))
r_k[0,:] = np.arange(Nr) * delta_r + r_min
### or you can do everything in one line
r_k = np.expand_dims(np.arange(Nr) * delta_r + r_min,axis=0)
这段代码有点笨拙,因为您在遍历每个元素时都新建了一个 np.array。这段代码同样可以简化。另外,prod 最初是 int,之后变成了含两个元素的 np.array,这是有意为之吗?
prod = 0
for m in range(0, Npy):
for l in range(0, Npx):
prod = prod + (np.array([acc_x[l, 0], acc_y[m, 0]]) * a_prime[m, l])
对于这个循环,您可以逐步把依赖循环变量的部分和与循环变量无关的部分分离开来。
#FOR LOOP TO BE TIME OPTIMIZED
for i in range(0, np.size(true_col, 0)): #true_col and true_rows has the same size so it doesn't matter
for m in range(0, Npy):
for l in range(0, Npx):
d = math.sqrt(math.pow(
(((true_col[i] - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
l * delta_px - (Npx * delta_px / 2) + x0)),
2) + math.pow(
(((true_rows[i] - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
m * delta_py - (Npy * delta_py / 2) + y0)),
2))
min_idx = np.argmin(np.abs(d - r_k))
rk_hat = r_k[0, min_idx]
if np.abs(d - rk_hat) < rho:
a[m, l, min_idx] = a[m, l, min_idx] + 1
外循环for i in range(0, np.size(true_col, 0))
没问题
下面这部分不需要循环来计算。对于涉及索引的乘法,您可以额外构造索引矩阵,使其形状与结果一一对应。
for m in range(0, Npy):
for l in range(0, Npx):
d = math.sqrt(math.pow(
(((true_col[i] - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
l * delta_px - (Npx * delta_px / 2) + x0)),
2) + math.pow(
(((true_rows[i] - math.floor((N_im + 1) / 2)) / (N_im + 1) / 2) - (
m * delta_py - (Npy * delta_py / 2) + y0)),
2))
要模拟 m 和 l 的取值,您可以创建 Npy × Npx 的索引矩阵。这种写法看起来可能有些奇怪,但它是 Numpy 从 MATLAB 生态继承来的技巧:MATLAB/numpy 的目标是简化代码,让您把更多时间花在逻辑本身上。
## l matrix (Npy rows, Npx columns)
[[0,1,2,3,4,5,6,7,8....Npx-1],
[0,1,2,3,4,5,6,7,8....Npx-1],
.....
[0,1,2,3,4,5,6,7,8....Npx-1]]
## m matrix (Npy rows, Npx columns)
[[0,0,0,0,0,0,0,0,0,0,0,0],
[1,1,1,1,1,1,1,1,1,1,1,1],
.....
[Npy-1,Npy-1,Npy-1.....,Npy-1]]
## You can create both with one command
l_mat, m_mat = np.meshgrid(np.arange(Npx), np.arange(Npy))
>>> l_mat
array([[ 0, 1, 2, ..., 147, 148, 149],
[ 0, 1, 2, ..., 147, 148, 149],
[ 0, 1, 2, ..., 147, 148, 149],
...,
[ 0, 1, 2, ..., 147, 148, 149],
[ 0, 1, 2, ..., 147, 148, 149],
[ 0, 1, 2, ..., 147, 148, 149]])
>>> m_mat
array([[ 0, 0, 0, ..., 0, 0, 0],
[ 1, 1, 1, ..., 1, 1, 1],
[ 2, 2, 2, ..., 2, 2, 2],
...,
[97, 97, 97, ..., 97, 97, 97],
[98, 98, 98, ..., 98, 98, 98],
[99, 99, 99, ..., 99, 99, 99]])
使用这两个矩阵,您可以将其相乘以创建结果。
# Grid coordinates of every (m, l) cell at once; both have shape (Npy, Npx).
grid_x = l_mat * delta_px - (Npx * delta_px / 2) + x0
grid_y = m_mat * delta_py - (Npy * delta_py / 2) + y0
half = np.floor((N_im + 1) / 2)
# Distance from edge pixel i to every grid cell; shape (Npy, Npx).
d = np.sqrt(
    np.square((true_col[i] - half) / (N_im + 1) / 2 - grid_x)
    + np.square((true_rows[i] - half) / (N_im + 1) / 2 - grid_y)
)
对于这两行代码,您似乎正在设置一个argmin矩阵。
min_idx = np.argmin(np.abs(d - r_k))
rk_hat = r_k[0, min_idx]
https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html
# Index of the radius in r_k closest to each distance in d. np.vectorize
# applies the lambda element by element (a convenience, not a speed-up).
vfunc = np.vectorize(lambda x: np.argmin(np.abs(x - r_k)))
min_idx = vfunc(d)
# Map every index back to its radius value.
vfunc2 = np.vectorize(lambda x: r_k[0, x])
rk_hat = vfunc2(min_idx)
对于最后两行,d 和 rk_hat 应该都是 Npy × Npx 的矩阵。您可以使用矩阵切片或 np.where 来创建矩阵掩码。
if np.abs(d - rk_hat) < rho:
points = np.where( np.abs(d-rk_hat) < rho )
https://numpy.org/doc/stable/reference/generated/numpy.where.html
我放弃了最后一行,如果你把它放在一个循环中可能没有关系
a[m, l, min_idx] = a[m, l, min_idx] + 1
# np.where returns one index array per axis, so pair them up with zip to
# visit each (row, col) hit; assumes d is 2-D as described above.
for row, col in zip(*points):
    a[row, col, min_idx[row, col]] += 1
答案 2 :(得分:0)
优化嵌套循环的新答案,
....
for i in range(0, np.size(true_col, 0)): #true_col and true_rows has the same size so it doesn't matter
for m in range(0, Npy):
for l in range(0, Npx):
处理时间有显着改善。对于长度为 2500 的 true_col
和 true_rows
,在我的机器上大约需要 3 秒。它位于一个用于测试目的的函数中。
def new():
    """Vectorised replacement for the triple nested accumulator loop.

    Reads the module-level settings N_im, Npx, Npy, delta_px, delta_py, x0,
    y0, Nr, rho and r_k, and the edge coordinates true_col / true_rows.

    Returns:
        Integer array of shape (Npy, Npx, Nr). Entry [m, l, k] is the number
        of edge points whose distance to grid cell (m, l) is closest to
        r_k[0, k] and differs from it by less than rho.
    """
    half = math.floor((N_im + 1) / 2)
    scale = N_im + 1
    # Centred and scaled edge coordinates, shape (n,).
    px = (np.asarray(true_col) - half) / scale / 2
    py = (np.asarray(true_rows) - half) / scale / 2
    # Grid coordinates along each axis, shapes (Npx,) and (Npy,).
    gx = np.arange(Npx) * delta_px - (Npx * delta_px / 2) + x0
    gy = np.arange(Npy) * delta_py - (Npy * delta_py / 2) + y0
    # Broadcast to distances of shape (Npy, Npx, n); the last axis plays the
    # role of the outer loop over edge points in the nested-loop version.
    d = np.sqrt(
        np.square(px - gx[None, :, None]) + np.square(py - gy[:, None, None])
    )

    # Nearest radius per distance. Looping over the Nr radii keeps the peak
    # memory at the size of d instead of building a (Npy, Npx, n, Nr) array.
    radii = np.reshape(r_k, -1)
    min_idx = np.zeros(d.shape, dtype=np.intp)
    best_err = np.abs(d - radii[0])
    for k in range(1, radii.size):
        err = np.abs(d - radii[k])
        # Strict '<' keeps the first minimum on ties, like np.argmin.
        closer = err < best_err
        min_idx[closer] = k
        best_err[closer] = err[closer]

    # One bincount over flattened (m, l, k) indices replaces a per-cell loop.
    hit = best_err < rho
    rows, cols, _ = np.nonzero(hit)
    flat = (rows * Npx + cols) * Nr + min_idx[hit]
    counts = np.bincount(flat, minlength=Npy * Npx * Nr)
    return counts.reshape(Npy, Npx, Nr).astype(int, copy=False)
我还不知道如何用 Numpy 方法去掉最后那个循环(即按小于 rho 的条件过滤并执行 bincount 的循环)——如果我想出来了,我会更新。
来自您的问题和评论的数据
import math
import numpy as np
# Seeded random stand-ins for the edge coordinates. The draw order
# (columns first, then rows) determines the generated values.
np.random.seed(5)
n_ = 2500
true_col = np.random.randint(low=0, high=256, size=n_)
true_rows = np.random.randint(low=0, high=256, size=n_)

# Image size and grid offsets.
N_im = 256
x0, y0 = 0, -0.25

# Grid dimensions (a smaller alternative is kept below, commented out).
Npx, Npy = 129, 60
# Npx = 8
# Npy = 4

# Grid spacing along y and x.
delta_py, delta_px = 0.025, 0.031

# Radius table settings, vote tolerance and threshold fraction.
Nr = 9
delta_r = 0.5
rho = 0.063
epsilon = 0.75
r_min = 0.5

# Candidate radii as a (1, Nr) row vector.
r_k = (np.arange(Nr) * delta_r + r_min)[np.newaxis, :]
函数中的原始嵌套循环 - 添加了一些诊断功能。
def original(writer=None):
    """Reference nested-loop accumulator, with optional per-hit logging.

    Reads the module-level settings N_im, Npx, Npy, delta_px, delta_py, x0,
    y0, Nr, rho and r_k, and the edge coordinates true_col / true_rows.

    Args:
        writer: optional csv.writer-like object. When given, one row
            [i, m, l, d, min_idx, rk_hat, new_count] is written per hit.

    Returns:
        Integer array of shape (Npy, Npx, Nr) holding the vote counts.
    """
    a = np.zeros((Npy, Npx, Nr), dtype=int)
    half = math.floor((N_im + 1) / 2)
    for i in range(np.size(true_col, 0)):  # true_col and true_rows have the same size
        # Centred and scaled pixel position; invariant over the grid loops.
        px = (true_col[i] - half) / (N_im + 1) / 2
        py = (true_rows[i] - half) / (N_im + 1) / 2
        for m in range(Npy):
            gy = m * delta_py - (Npy * delta_py / 2) + y0
            for l in range(Npx):
                gx = l * delta_px - (Npx * delta_px / 2) + x0
                d = math.sqrt(math.pow(px - gx, 2) + math.pow(py - gy, 2))
                min_idx = np.argmin(np.abs(d - r_k))  # scalar
                rk_hat = r_k[0, min_idx]  # scalar
                if np.abs(d - rk_hat) < rho:
                    if writer is not None:
                        writer.writerow([i, m, l, d, min_idx, rk_hat, a[m, l, min_idx] + 1])
                    a[m, l, min_idx] = a[m, l, min_idx] + 1
    return a