我用 Python 实现了一个双边滤波器,并加上了 numba,以便(理论上)在我的 GPU 上运行它。但是,当我把 target 设置为 cuda 时,它运行得非常慢,而且似乎根本没有用到我的显卡;而把 target 设置为 parallel 时,速度却非常快。我最合理的猜测是,我的函数没有针对 GPU 运行做过优化:它无法一次性把同一个操作应用到所有元素上(因为它需要 x 和 y 坐标才能工作)。所以我的问题是:为什么在 target 设置为 cuda 的情况下运行会这么慢?
# NOTE(review): with the layout '(n,m)->(n,m)' and a single 2-D image as
# input, both axes are core dimensions, so the gufunc has no loop dimension
# left to spread across GPU threads. The whole image is then presumably
# filtered by one thread, which would explain why target='cuda' is slow.
# Real GPU parallelism needs a kernel that maps one thread to one output
# pixel (e.g. numba.cuda.jit with a 2-D grid); that changes how callers
# launch the function, so it is deliberately not done here -- confirm
# against the Numba CUDA documentation before restructuring.
@guvectorize([(float64[:, :], float64[:, :])], '(n,m)->(n,m)',
             target='cuda', nopython=True)
def apply_filter(img, filteredImage):
    """Bilateral-filter ``img`` and store the result in ``filteredImage``.

    Every output pixel is the weighted mean of the input pixels in a
    ``diameter`` x ``diameter`` window around it, rounded to the nearest
    integer. A neighbour's weight is the product of a Gaussian of its
    intensity difference to the centre pixel (sigma ``sIntesity``) and a
    Gaussian of its spatial distance to the centre pixel (sigma ``sSpace``).

    ``diameter``, ``sIntesity`` and ``sSpace`` are free names that must be
    defined outside this function.

    Near the image border the window is clipped to the image, so every
    pixel of ``filteredImage`` is written and no index ever goes negative
    (a negative index would silently wrap around to the opposite edge).

    Args:
        img: 2-D float64 input image.
        filteredImage: 2-D float64 output array with the same shape as
            ``img``; overwritten in place.
    """
    imh = img.shape[0]
    imw = img.shape[1]
    radius = diameter // 2

    # Loop-invariant Gaussian constants, computed once instead of once per
    # neighbour. The normalisation factors cancel in the final division but
    # are kept so interior pixels get exactly the same weights as before.
    intensity_norm = 1.0 / (2 * math.pi * (sIntesity ** 2))
    intensity_denom = 2 * sIntesity ** 2
    space_norm = 1.0 / (2 * math.pi * (sSpace ** 2))
    space_denom = 2 * sSpace ** 2

    # Plain range(): a gufunc body is not compiled with parallel=True, so
    # prange would not parallelise these loops anyway, and the two inner
    # loops accumulate into shared scalars.
    for h in range(imh):
        # Window rows [h - radius, h - radius + diameter), clipped to the image.
        row_start = max(h - radius, 0)
        row_stop = min(h - radius + diameter, imh)
        for w in range(imw):
            col_start = max(w - radius, 0)
            col_stop = min(w - radius + diameter, imw)

            center = img[h, w]
            weight_sum = 0.0
            filtered_pixel = 0.0
            for x in range(row_start, row_stop):
                for y in range(col_start, col_stop):
                    neighbour = img[x, y]
                    intensity_diff = neighbour - center
                    intensity = intensity_norm * math.exp(
                        -(intensity_diff ** 2) / intensity_denom)
                    # Squared distance is used directly; taking the square
                    # root only to square it again is wasted work.
                    dist_sq = (x - h) ** 2 + (y - w) ** 2
                    smoothing = space_norm * math.exp(-dist_sq / space_denom)
                    weight = intensity * smoothing
                    filtered_pixel += neighbour * weight
                    weight_sum += weight

            if weight_sum > 0.0:
                filteredImage[h, w] = int(round(filtered_pixel / weight_sum))
            else:
                # Empty window (diameter <= 0) or non-finite weights: pass
                # the source pixel through instead of dividing by zero.
                filteredImage[h, w] = center