In my current project I need to find the pixel-exact position of an image contained inside another, larger image. The smaller image is never rotated or scaled (so it should match pixel by pixel), but it may have a different brightness, and some of its pixels may be distorted. My first attempt was to do it on the CPU, but that was too slow. The computation is highly parallel, so I decided to use the GPU. I have only just started learning CUDA and wrote my first CUDA application. My code works, but even on the GPU it is still too slow. When the larger image is 1024x1280 and the smaller one is 128x128, the computation takes 2000 ms on a GeForce GTX 560 Ti. I need the result in less than 200 ms. In the future I will probably need a more complex algorithm, so I would rather have some extra headroom in computing power. The question is: how can I optimize my code to achieve that speedup?
CUDAImageLib.dll:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cutil.h>
//#define SUPPORT_ALPHA
__global__ void ImageSearch_kernel(float* BufferOut, float* BufferB, float* BufferS, unsigned int bw, unsigned int bh, unsigned int sw, unsigned int sh)
{
unsigned int bx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int by = threadIdx.y + blockIdx.y * blockDim.y;
float diff = 0;
for (unsigned int y = 0; y < sh; ++y)
{
for (unsigned int x = 0; x < sw; ++x)
{
unsigned int as = (x + y * sw) * 4;
unsigned int ab = (x + bx + (y + by) * bw) * 4;
#ifdef SUPPORT_ALPHA
diff += ((abs(BufferS[as] - BufferB[ab]) + abs(BufferS[as + 1] - BufferB[ab + 1]) + abs(BufferS[as + 2] - BufferB[ab + 2])) * BufferS[as + 3] * BufferB[ab + 3]);
#else
diff += abs(BufferS[as] - BufferB[ab]);
diff += abs(BufferS[as + 1] - BufferB[ab + 1]);
diff += abs(BufferS[as + 2] - BufferB[ab + 2]);
#endif
}
}
BufferOut[bx + (by * (bw - sw))] = diff;
}
extern "C" int __declspec(dllexport) __stdcall ImageSearchGPU(float* BufferOut, float* BufferB, float* BufferS, int bw, int bh, int sw, int sh)
{
int aBytes = (bw * bh) * 4 * sizeof(float);
int bBytes = (sw * sh) * 4 * sizeof(float);
int cBytes = ((bw - sw) * (bh - sh)) * sizeof(float);
dim3 threadsPerBlock(32, 32);
dim3 numBlocks((bw - sw) / threadsPerBlock.x, (bh - sh) / threadsPerBlock.y);
float *dev_B = 0;
float *dev_S = 0;
float *dev_Out = 0;
unsigned int timer = 0;
float sExecutionTime = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_Out, cBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_B, aBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_S, bBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_B, BufferB, aBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_S, BufferS, bBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cutCreateTimer(&timer);
cutStartTimer(timer);
// Launch a kernel on the GPU with one thread for each element.
ImageSearch_kernel<<<numBlocks, threadsPerBlock>>>(dev_Out, dev_B, dev_S, bw, bh, sw, sh);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cutStopTimer(timer);
sExecutionTime = cutGetTimerValue(timer);
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(BufferOut, dev_Out, cBytes, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_Out);
cudaFree(dev_B);
cudaFree(dev_S);
return (int)sExecutionTime;
}
extern "C" int __declspec(dllexport) __stdcall FindMinCPU(float* values, int count)
{
int minIndex = 0;
float minValue = 3.4e+38F;
for (int i = 0; i < count; ++i)
{
if (values[i] < minValue)
{
minValue = values[i];
minIndex = i;
}
}
return minIndex;
}
C# test app:
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Drawing;

namespace TestCUDAImageSearch
{
    class Program
    {
        static void Main(string[] args)
        {
            using (Bitmap big = new Bitmap("Big.png"), small = new Bitmap("Small.png"))
            {
                Console.WriteLine("Big " + big.Width + "x" + big.Height + " Small " + small.Width + "x" + small.Height);

                Stopwatch sw = new Stopwatch();
                sw.Start();
                Point point = CUDAImageLIb.ImageSearch(big, small);
                sw.Stop();
                long t = sw.ElapsedMilliseconds;

                Console.WriteLine("Image found at " + point.X + "x" + point.Y);
                Console.WriteLine("total time=" + t + "ms kernel time=" + CUDAImageLIb.LastKernelTime + "ms");
            }
            Console.WriteLine("Hit key");
            Console.ReadKey();
        }
    }
}
//#define SUPPORT_HSB
using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;

namespace TestCUDAImageSearch
{
    public static class CUDAImageLIb
    {
        [DllImport("CUDAImageLib.dll")]
        private static extern int ImageSearchGPU(float[] bufferOut, float[] bufferB, float[] bufferS, int bw, int bh, int sw, int sh);

        [DllImport("CUDAImageLib.dll")]
        private static extern int FindMinCPU(float[] values, int count);

        private static int _lastKernelTime = 0;

        public static int LastKernelTime
        {
            get { return _lastKernelTime; }
        }
        public static Point ImageSearch(Bitmap big, Bitmap small)
        {
            int bw = big.Width;
            int bh = big.Height;
            int sw = small.Width;
            int sh = small.Height;
            int mx = (bw - sw);
            int my = (bh - sh);

            float[] diffs = new float[mx * my];
            float[] b = ImageToFloat(big);
            float[] s = ImageToFloat(small);

            _lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);

            int minIndex = FindMinCPU(diffs, diffs.Length);
            return new Point(minIndex % mx, minIndex / mx);
        }

        public static List<Point> ImageSearch(Bitmap big, Bitmap small, float maxDeviation)
        {
            int bw = big.Width;
            int bh = big.Height;
            int sw = small.Width;
            int sh = small.Height;
            int mx = (bw - sw);
            int my = (bh - sh);
            int nDiff = mx * my;

            float[] diffs = new float[nDiff];
            float[] b = ImageToFloat(big);
            float[] s = ImageToFloat(small);

            _lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);

            List<Point> points = new List<Point>();
            for (int i = 0; i < nDiff; ++i)
            {
                if (diffs[i] < maxDeviation)
                {
                    points.Add(new Point(i % mx, i / mx));
                }
            }
            return points;
        }
#if SUPPORT_HSB
        private static float[] ImageToFloat(Bitmap img)
        {
            int w = img.Width;
            int h = img.Height;
            float[] pix = new float[w * h * 4];
            int i = 0;
            for (int y = 0; y < h; ++y)
            {
                for (int x = 0; x < w; ++x)
                {
                    Color c = img.GetPixel(x, y);
                    pix[i] = c.GetHue() / 360;
                    pix[i + 1] = c.GetSaturation();
                    pix[i + 2] = c.GetBrightness();
                    pix[i + 3] = c.A;
                    i += 4;
                }
            }
            return pix;
        }
#else
        private static float[] ImageToFloat(Bitmap bmp)
        {
            int w = bmp.Width;
            int h = bmp.Height;
            int n = w * h;
            float[] pix = new float[n * 4];

            System.Diagnostics.Debug.Assert(bmp.PixelFormat == PixelFormat.Format32bppArgb);
            Rectangle r = new Rectangle(0, 0, w, h);
            BitmapData bmpData = bmp.LockBits(r, ImageLockMode.ReadOnly, bmp.PixelFormat);
            System.Diagnostics.Debug.Assert(bmpData.Stride > 0);
            int[] pixels = new int[n];
            System.Runtime.InteropServices.Marshal.Copy(bmpData.Scan0, pixels, 0, n);
            bmp.UnlockBits(bmpData);

            int j = 0;
            for (int i = 0; i < n; ++i)
            {
                pix[j] = (pixels[i] & 255) / 255.0f;
                pix[j + 1] = ((pixels[i] >> 8) & 255) / 255.0f;
                pix[j + 2] = ((pixels[i] >> 16) & 255) / 255.0f;
                pix[j + 3] = ((pixels[i] >> 24) & 255) / 255.0f;
                j += 4;
            }
            return pix;
        }
#endif
    }
}
Answer 0 (score: 3)
It looks like what you are describing is a well-known problem: template matching. The simplest approach is to convolve the image (the larger one) with the template (the smaller one). You can implement the convolution in one of two ways.

1) Modify the convolution example in the CUDA SDK (similar to what you are already doing).

2) Implement the convolution with FFTs; see the convolution theorem. You need to remember that
% MATLAB format
L = size(A) + size(B) - 1;
conv2(A, B) = IFFT2(FFT2(A, L) .* FFT2(B, L));
You can use CUFFT to compute the two-dimensional FFTs (after padding the inputs appropriately). Before performing the inverse FFT, you need to write a kernel that does the element-wise multiplication and then normalizes the result (because CUFFT does not normalize).
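As a rough illustration of such a kernel, the sketch below multiplies two spectra element by element and applies the normalization factor. It assumes the spectra are stored as cufftComplex arrays of n elements; the kernel name and signature are not from the answer, only an example.

#include <cufft.h>

// Multiply two spectra element-wise and scale the result, since CUFFT
// performs unnormalized transforms. The product is written back into 'a'.
__global__ void PointwiseMulAndScale(cufftComplex* a, const cufftComplex* b,
                                     unsigned int n, float scale)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        cufftComplex va = a[i];
        cufftComplex vb = b[i];
        cufftComplex r;
        r.x = (va.x * vb.x - va.y * vb.y) * scale; // real part
        r.y = (va.x * vb.y + va.y * vb.x) * scale; // imaginary part
        a[i] = r;
    }
}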
For the sizes you mention (1024 x 1280 and 128 x 128), the inputs must be padded to at least (1024 + 128 - 1) x (1280 + 128 - 1) = 1151 x 1407. But the FFT is fastest when the (padded) inputs have power-of-two dimensions, so you would pad both the large and the small image to 2048 x 2048.
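Putting it together, the host side of the FFT path could look roughly like the following minimal sketch. Buffer allocation, padding of the two images into d_bigPadded / d_smallPadded, and error checking are omitted, and all variable names are placeholders; it reuses the multiply-and-scale kernel sketched above.

int padW = 2048, padH = 2048;                       // power-of-two padding as discussed above
int nFreq = padH * (padW / 2 + 1);                  // size of a real-to-complex spectrum
cufftHandle fwdPlan, invPlan;
cufftPlan2d(&fwdPlan, padH, padW, CUFFT_R2C);
cufftPlan2d(&invPlan, padH, padW, CUFFT_C2R);
cufftExecR2C(fwdPlan, d_bigPadded, d_bigFreq);      // FFT of the padded big image
cufftExecR2C(fwdPlan, d_smallPadded, d_smallFreq);  // FFT of the padded template
PointwiseMulAndScale<<<(nFreq + 255) / 256, 256>>>(
    d_bigFreq, d_smallFreq, nFreq, 1.0f / (padW * padH));
cufftExecC2R(invPlan, d_bigFreq, d_result);         // inverse FFT gives the convolution result
cufftDestroy(fwdPlan);
cufftDestroy(invPlan);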
Answer 1 (score: 2)
You could speed up the computation with faster memory access, for example by using ...

But your real problem is the overall approach of the comparison. Comparing the images pixel by pixel at every possible position will never be efficient; there is far too much work to do. First, you should think about finding ways to ...
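The answer is cut off here. As one common reading of the "faster memory access" hint (not necessarily what the answerer goes on to recommend), the sketch below stages the template in shared memory chunk by chunk, so every thread in a block rereads the template from fast on-chip memory instead of global memory. It keeps the same indexing and launch configuration as the original kernel; CHUNK_ROWS and the kernel name are assumptions made for this example.

#define CHUNK_ROWS 4  // template rows staged per iteration (tuning parameter, assumed)

__global__ void ImageSearchShared_kernel(float* BufferOut, const float* BufferB, const float* BufferS,
                                         unsigned int bw, unsigned int bh,
                                         unsigned int sw, unsigned int sh)
{
    // Dynamic shared memory: CHUNK_ROWS * sw * 3 floats (RGB only), size passed at launch.
    extern __shared__ float sTile[];
    unsigned int bx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int by = threadIdx.y + blockIdx.y * blockDim.y;
    float diff = 0.0f;
    for (unsigned int y0 = 0; y0 < sh; y0 += CHUNK_ROWS)
    {
        unsigned int rows = min((unsigned int)CHUNK_ROWS, sh - y0);
        // All threads of the block cooperatively copy 'rows' template rows into shared memory.
        for (unsigned int i = threadIdx.y * blockDim.x + threadIdx.x;
             i < rows * sw; i += blockDim.x * blockDim.y)
        {
            unsigned int src = ((y0 + i / sw) * sw + (i % sw)) * 4;
            sTile[i * 3]     = BufferS[src];
            sTile[i * 3 + 1] = BufferS[src + 1];
            sTile[i * 3 + 2] = BufferS[src + 2];
        }
        __syncthreads();
        // Accumulate the sum of absolute differences against the staged rows.
        for (unsigned int y = 0; y < rows; ++y)
        {
            for (unsigned int x = 0; x < sw; ++x)
            {
                unsigned int as = (y * sw + x) * 3;
                unsigned int ab = (x + bx + (y0 + y + by) * bw) * 4;
                diff += fabsf(sTile[as]     - BufferB[ab]);
                diff += fabsf(sTile[as + 1] - BufferB[ab + 1]);
                diff += fabsf(sTile[as + 2] - BufferB[ab + 2]);
            }
        }
        __syncthreads();
    }
    BufferOut[bx + by * (bw - sw)] = diff;
}

// Launched with the same grid as before, plus the dynamic shared-memory size, e.g.:
// ImageSearchShared_kernel<<<numBlocks, threadsPerBlock, CHUNK_ROWS * sw * 3 * sizeof(float)>>>(
//     dev_Out, dev_B, dev_S, bw, bh, sw, sh);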