CUDA上的线路检测

时间:2013-07-02 19:16:15

标签: cuda hough-transform

我正在尝试使用CUDA进行实时线路检测。我已经计算了霍夫变换以及每个仓的最小,最大线坐标。为了获得我正在跟踪的线段(使用Bresenham的线算法),通过最小值到最大点并获得每个箱子上的线段。当霍夫阈值很低并且图像中有很多行时,trace_lines需要很长时间才能完成。

hough变换(hough_line_transform)计算在GTX 660上每帧大约需要5-10ms(1280x720)(观察到比CPU实现快10倍)。但是从最小值跟踪线段,最大点需要1ms-15ms。

我对线路检测有两个问题

  1. 是否有更好的算法从霍夫箱的最小点,最大点获取线段?

  2. 是否可以进一步优化hought_line_transform(请参阅下面的代码)?我正在使用原子操作。是否有可能避免原子。

  3. 我附上以下代码。

    班级标题

    #ifndef _HOUGH_LINES_H_
    #define _HOUGH_LINES_H_
    
    #include <cuda_gl_interop.h>
    #include <thrust/device_vector.h>
    
    union Pos;
    struct Line;
    
    struct Hough_params
    {
        int w;
        int h;
        int r;
    };
    
    class Hough_lines
    {
    public:
        enum Type {INT, SHORT_INT, FLOAT};
    
        Hough_lines(int _w, int _h);
        ~Hough_lines();
    
    public:
        bool init();
        bool detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Type type, int& count);
    
    protected:
        void get_edges(thrust::device_vector<Pos>& d_coords, int& size);
        void get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size);
        void get_lines(int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count);
        void trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count);
    
        static void compute_trig_funcs();
    
    protected:
        Hough_params params;
        thrust::device_vector<Hough_params> d_param;
    
        static bool trig_init;
    };
    
    #endif
    

    #include <hough_lines.h>
    
    #include <math.h>
    #include <stdio.h>
    
    #include <cuda.h>
    #include <cuda_runtime_api.h>
    
    #include <cuda_gl_interop.h>
    
    #include <thrust/host_vector.h>
    
    #include <thrust/copy.h>
    #include <thrust/scan.h>
    
    #define ANGLE_SIZE 360
    #define MAX_LINE_PER_THREAD 10
    
    union Pos
    {
        struct
        {
            uint16_t x;
            uint16_t y;
        };
        uint32_t value;
    };
    
    struct Hough_info
    {
        Pos end;
        Pos start;
        int count;
    };
    
    struct Line
    {
        Pos start;
        Pos end;
    };
    
    struct Line_info
    {
        int line_count;
        Line line[MAX_LINE_PER_THREAD];
    };
    
    __constant__ float dev_sint[ANGLE_SIZE];
    __constant__ float dev_cost[ANGLE_SIZE];
    
    texture<uint8_t, 2, cudaReadModeElementType> luma_tex;
    
    bool Hough_lines::trig_init = false;
    
    __global__ void mark_edges(const Hough_params* param, int* edge)
    {
        int x = (blockIdx.x*blockDim.x+threadIdx.x);
        int y = (blockIdx.y*blockDim.y+threadIdx.y);
        int pos = x+(param->w*y);   
        edge[pos] = (255 == tex2D(luma_tex, x, y))?1:0;
    }
    
    __global__ void get_coords(const Hough_params* param, int* edge, Pos* coord)
    {
        int index;
        int x = (blockIdx.x*blockDim.x+threadIdx.x);
        int y = (blockIdx.y*blockDim.y+threadIdx.y);
        int pos = x+(param->w*y);   
        if (255 == tex2D(luma_tex, x, y))
        {
            index = edge[pos];
            coord[index].y = y;
            coord[index].x = x;
        }
    }
    
    __global__ void hough_line_transform(const Hough_params* param, int size, const Pos* coord, int threshold, int *mark, Hough_info* out)
    {
        int i;
        int angle;
        int rdata;
        __shared__ Hough_info sh_rho_data[1001];
    
        i = threadIdx.x;
        while (i < param->r)
        {
            sh_rho_data[i].end.value = 0x0;
            sh_rho_data[i].start.value = 0xFFFFFFFF;
            sh_rho_data[i].count = 0;
    
            i += blockDim.x;
        }
        __syncthreads();
    
        i = threadIdx.x;
        angle = blockIdx.x;
        const float cos_angle = dev_cost[angle];
        const float sin_angle = dev_sint[angle];
        while (i < size)
        {
            rdata = (int)ceil(((float)(coord[i].x-(param->w>>1))*cos_angle)+((float)((param->h>>1)-coord[i].y)*sin_angle));
            if (rdata >= 0)
            {
                atomicMax(&sh_rho_data[rdata].end.value, coord[i].value);
                atomicMin(&sh_rho_data[rdata].start.value, coord[i].value);
    
                atomicAdd(&sh_rho_data[rdata].count, 1);
            }
            i += blockDim.x;
        }
    
        __syncthreads();
    
        i = threadIdx.x;
        rdata = (angle*param->r);
        while (i < param->r)
        {
            memcpy(&out[rdata+i], &sh_rho_data[i], sizeof(Hough_info));
            mark[rdata+i] = (sh_rho_data[i].count >= threshold)?1:0;
            i += blockDim.x;
        }
    }
    
    __global__ void get_lines(const Hough_params* param, int threshold, Hough_info* hdata, int* mark, Line* lines)
    {
        int pos;
        int i = threadIdx.x;
        int offset = (blockIdx.x*param->r);
        while (i < param->r)
        {
            if (hdata[offset+i].count >= threshold)
            {
                pos = mark[offset+i];
                lines[pos].start.value = hdata[offset+i].start.value;
                lines[pos].end.value = hdata[offset+i].end.value;
            }
            i += blockDim.x;
        }
    }
    
    __device__ void add_line(int xs, int ys, int xe, int ye, int min_len, Line_info* line)
    {
        int d = abs(xe-xs)+abs(ye-ys);
        if ((d >= min_len) && (line->line_count < MAX_LINE_PER_THREAD))
        {
            line->line[line->line_count].start.x = xs;
            line->line[line->line_count].start.y = ys;
            line->line[line->line_count].end.x = xe;
            line->line[line->line_count].end.y = ye;
    
            ++line->line_count;
    
            //printf("\n(%d %d) (%d %d) %d", xs, ys, xe, ye, d);
        }
    } 
    
    __global__ void trace_lines(const Line* input, int inp_size, int min_len, int min_gap, Line_info* line_info, int* mark)
    {
        int d;
        int dsub;
        int dstep;
        int xstep;
        int ystep;
    
        int xs, ys, xe, ye;
        int i = (blockIdx.x*blockDim.x+threadIdx.x);
        if (i >= inp_size)
        {
            return;
        }
    
        xs = input[i].start.x;
        ys = input[i].start.y;
        xe = input[i].end.x;
        ye = input[i].end.y;
    
        line_info[i].line_count = 0;
    
        int dx = abs(xe-xs);
        int dy = abs(ye-ys);
        int xinc = (xe > xs)?1:-1;
        int yinc = (ye > ys)?1:-1;
    
        int gap = 0;
        bool sflag;
        int s_x, s_y, e_x, e_y;
    
        if (dx > dy)
        {
            dsub = (dx<<1);
            dstep = (dy<<1);
            d = dstep-dx;
            xstep = xinc;
            ystep = 0;
            xinc = 0;
        }
        else
        {
            dsub = (dy<<1);
            dstep = (dx<<1);
            d = dstep-dy;
            xstep = 0;
            ystep = yinc;
            yinc = 0;
        }
    
        sflag = true;
        s_x = xs;
        s_y = ys;
        e_x = xs;
        e_y = ys;
    
        int x = xs;
        int y = ys;
    
        while ((abs(x-xs) <= dx) && (abs(y-ys) <= dy))
        {
            x += xstep;
            y += ystep;
            if (d > 0)
            {
                x += xinc;
                y += yinc;
    
                d -= dsub;
            }
            d += dstep;
    
            if (255 == tex2D(luma_tex, x, y))
            {
                e_x = x;
                e_y = y;
                gap = 0;
    
                if (!sflag)
                {
                    s_x = x;
                    s_y = y;
                    sflag = true;
                }
            }
            else if (sflag)
            {
                ++gap;
                if (gap >= min_gap)
                {
                    sflag = false;
                    add_line(s_x, s_y, e_x, e_y, min_len, &line_info[i]);
                }
            }
        }
    
        if (sflag)
        {
            add_line(s_x, s_y, xe, ye, min_len, &line_info[i]);
        }
        mark[i] = line_info[i].line_count;
    }
    
    __global__ void copy_line_coords(const Hough_params* param, Line_info* line, int size, int* mark, int* coords, int* count)
    {
        int index = (blockIdx.x*blockDim.x+threadIdx.x);
        if (index >= size)
        {
            return;
        }
    
        int pos;
        int start = 4*mark[index];
        Line* line_data = &line[index].line[0];
        for (int i = 0; i < line[index].line_count; i++)
        {
            pos = start+(4*i);
            coords[pos] = line_data[i].start.x-(param->w>>1);
            coords[pos+1] = (param->h>>1)-line_data[i].start.y;
            coords[pos+2] = line_data[i].end.x-(param->w>>1);
            coords[pos+3] = (param->h>>1)-line_data[i].end.y;
        }
    
        if ((index+1) == size)
        {
            *count = mark[index];
        }
    }
    
    Hough_lines::Hough_lines(int _w, int _h)
        :d_param(1)
    {
        params.w = _w;
        params.h = _h;
        params.r = (int)ceil(0.5*sqrt((_w*_w)+(_h*_h)));
        thrust::copy_n(&params, 1, d_param.begin());
    }
    
    Hough_lines::~Hough_lines()
    {
    }
    
    bool Hough_lines::init()
    {
        if (false == trig_init)
        {
            trig_init = true;
            compute_trig_funcs();
        }
        return true;
    }
    
    void Hough_lines::compute_trig_funcs()
    {
        float theta;
        cudaError_t err = cudaSuccess;
        static float sint[ANGLE_SIZE];
        static float cost[ANGLE_SIZE];
    
        for (int i = 0; i < ANGLE_SIZE; i++)
        {
            theta = (M_PI*(float)i)/180.0;
            sint[i] = sin(theta);
            cost[i] = cos(theta);
        }
    
        err = cudaMemcpyToSymbol(dev_sint, sint, ANGLE_SIZE*sizeof(float));
        err = (cudaSuccess == err) ? cudaMemcpyToSymbol(dev_cost, cost, ANGLE_SIZE*sizeof(float)):err;
        if (cudaSuccess != err)
        {
            printf("\n%s", cudaGetErrorString(cudaGetLastError()));
        } 
    }
    
    void Hough_lines::get_edges(thrust::device_vector<Pos>& d_coords, int& size)
    {
        dim3 bsize(16, 16);
        dim3 gsize(params.w/bsize.x, params.h/bsize.y);
        thrust::device_vector<int> d_mark(params.w*params.h);
    
        size = 0;
        mark_edges<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
            thrust::raw_pointer_cast(d_mark.data()));
        thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
        get_coords<<<gsize, bsize>>>(thrust::raw_pointer_cast(d_param.data()),
            thrust::raw_pointer_cast(d_mark.data()),
            thrust::raw_pointer_cast(d_coords.data()));
        thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
    }
    
    void Hough_lines::get_hough_lines(int threshold, thrust::device_vector<Line>& d_lines, int& size)
    {
        int edge_count = 0;
        thrust::device_vector<Pos> d_coords(params.w*params.h); 
        get_edges(d_coords, edge_count);
    
        thrust::device_vector<int> d_mark(params.r*360);
        thrust::device_vector<Hough_info> d_hough_data(params.r*360);
        hough_line_transform<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
            edge_count,
            thrust::raw_pointer_cast(d_coords.data()), threshold,
            thrust::raw_pointer_cast(d_mark.data()), 
            thrust::raw_pointer_cast(d_hough_data.data())); 
        thrust::exclusive_scan(d_mark.begin(), d_mark.end(), d_mark.begin());
        ::get_lines<<<360, 256>>>(thrust::raw_pointer_cast(d_param.data()),
            threshold,
            thrust::raw_pointer_cast(d_hough_data.data()),
            thrust::raw_pointer_cast(d_mark.data()),
            thrust::raw_pointer_cast(d_lines.data()));
        thrust::copy_n(d_mark.begin()+d_mark.size()-1, 1, &size);
    }
    
    void Hough_lines::trace_all_lines(int min_len, int min_gap, thrust::device_vector<Line>& d_lines, int size, int* d_line_coord, int& count)
    {
        thrust::device_vector<int> d_mark_line(size);
        thrust::device_vector<Line_info> d_nlines(size);
    
        trace_lines<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_lines.data()), 
            size, min_len, min_gap, thrust::raw_pointer_cast(d_nlines.data()),
            thrust::raw_pointer_cast(d_mark_line.data()));
    
        thrust::exclusive_scan(d_mark_line.begin(), d_mark_line.end(), d_mark_line.begin());
    
        thrust::device_vector<int> d_count(1);
        copy_line_coords<<<(1+(size/512)), 512>>>(thrust::raw_pointer_cast(d_param.data()),
            thrust::raw_pointer_cast(d_nlines.data()), size,
            thrust::raw_pointer_cast(d_mark_line.data()), d_line_coord,
            thrust::raw_pointer_cast(d_count.data()));
    
        thrust::copy(d_count.begin(), d_count.end(), &count);
        //printf("\nLine count: %d", count);
    }
    
    void Hough_lines::get_lines(int threshold, int min_len, int min_gap, GLuint line, Hough_lines::Type type, int& count)
    {
        int* d_line_coord = 0;
        cudaGLRegisterBufferObject(line);
        cudaGLMapBufferObject((void **)&d_line_coord, line);
    
        int size = 0;
        thrust::device_vector<Line> d_lines(params.r*360); 
        get_hough_lines(threshold, d_lines, size);
        //printf("\nget_hough_lines: %d", size);
    
        trace_all_lines(min_len, min_gap, d_lines, size, d_line_coord, count);
    
        cudaGLUnmapBufferObject(line);
        cudaGLUnregisterBufferObject(line);
        }
    
        bool Hough_lines::detect_lines(GLuint tex_edge, int threshold, int min_length, int min_gap, GLuint line, Hough_lines::Type type, int& count)
        {
        cudaError_t err;
        cudaArray* array_edge;
        cudaGraphicsResource* res_edge;
    
        err = cudaGraphicsGLRegisterImage(&res_edge, tex_edge, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly);
        if (err != cudaSuccess)
        {
            printf("cudaGraphicsGLRegisterImage Failed: %s", cudaGetErrorString(cudaGetLastError()));
            exit(0);
        }
    
        cudaGraphicsMapResources(1, &res_edge);
        cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<uint8_t>();
        err = cudaGraphicsSubResourceGetMappedArray(&array_edge, res_edge, 0, 0);
        if (err != cudaSuccess)
        {
            printf("cudaGraphicsSubResourceGetMappedArray Failed: %s", cudaGetErrorString(cudaGetLastError()));
            exit(0);
        }
    
        if (cudaBindTextureToArray(&luma_tex, array_edge, &chan_desc) != cudaSuccess)
        {
            printf("Failed to bind texture - %s\n", cudaGetErrorString(cudaGetLastError()));
            exit(0);
        }
    
        float time = 0.0;
        //static float max = 0.0;
        cudaEvent_t start, stop;
    
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);
    
        count = 0;
        get_lines(threshold, min_length, min_gap, line, type, count);
    
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
    
        //static int frame = 0;
        //frame++;
        //if (time > max)
        {
            //max = time;
            printf("\nElpased time: %f ms", time);
        }
    
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    
        cudaUnbindTexture(luma_tex);   
        cudaGraphicsUnmapResources(1, &res_edge);
        cudaGraphicsUnregisterResource(res_edge);
    
        return true;
    }
    

1 个答案:

答案 0 :(得分:2)

"Prefix sums and their applications"中,Guy Blelloch描述了使用并行前缀和的并行线绘制算法。参见该论文的第55页,它可能会给你一些想法。

关于如何优化hough_line_transfer,我认为关键是消除循环中的共享内存原子。在您使用它们的地方,您实际上在进行键控缩减。 Thrust提供reduce_by_key函数,但只能从主机调用。与Thrust相对应的设备库是CUB,但它没有reduce_by_key。我已经向CUB作者询问了这里的想法,如果我们提出任何建议,我会更新这个答案。

您可以编写自己的键控缩减,但如果可能的话,依靠库可以提高效率和稳健性。