Question

我在立体视觉上工作，我有一个推力库的问题，当我在我的内核函数中使用它时，应用程序运行和bug，因为所有内核都没有在我的＆lt;＆lt;＆lt;＆lt; ＆GT;＆GT;＆GT;打电话，但当我删除排序所有工作（但结果不好）。我已经寻找替代方案，但除了针对gpu排序的推力之外我什么都没发现。感谢

#include <opencv2/highgui/highgui.hpp>
#include <opencv2/core/core.hpp>
#include <iostream>
#include <time.h>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <device_launch_parameters.h>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/core/cuda.hpp>
#include <algorithm>
#include <functional>
#include <array>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <opencv2/imgproc/imgproc.hpp>

using namespace std;
using namespace cv;

const int correlationWindow = 81;
const int widthWindow = (int)sqrt((float)correlationWindow);
const int searchWindow = 52;

__constant__ int widthWindow2 = 9;

__device__
void makeVector(float *mat, float *vec, int col, int x, int y) {
int ind = 0;
for (int i = x; i < x + widthWindow2; i++) {
    for (int j = y; j < y + widthWindow2; j++) {
        vec[ind] = mat[col * i + j];
        ind++;
    }
}
}

__device__
void disparityUpdate2(int i, int j, int col, int distance, float *d_disparity) {
d_disparity[col * i + j] = ((255 / searchWindow) * distance);
}

void resize(float *d_disparity, Mat &disparity) {
for (int i = 0; i < disparity.rows; i++) {
    for (int j = 0; j < disparity.cols; j++) {
        int ind = disparity.cols * i + j;
        disparity.at<float>(i, j) = d_disparity[ind];
    }
}
}

__global__
void computeSMAD2(int minX, float *d_mL, float *d_mR, float *dif, float *windowL, float *windowR, float  *d_disparity, int colmLO, int colmL, int seachWindow) {
int mini;

int pOiX(threadIdx.x + minX); // + minX
int pOiY(blockIdx.x + minX);
int newPoIx(max(minX, pOiX - searchWindow));
int newPoIy(pOiY); 
/*int pOiX(pox);
int pOiY(poy);
int newPoIx(npox);
int newPoIy(npoy); */
int minPoIx(newPoIx);
int smad = 0;
int bMax = (int)(correlationWindow / 2);

makeVector(d_mL, windowL, colmL, pOiY, pOiX); // ATTENTION ligne / colonne
makeVector(d_mR, windowR, colmL, newPoIy, newPoIx);

for (int h = 0; h < correlationWindow; h++) {
    dif[h] = windowL[h] - windowR[h];
}

thrust::sort(thrust::seq, dif, dif + correlationWindow);

int median = dif[(correlationWindow - 1) / 2];

for (int h = 0; h < correlationWindow; h++) {
    dif[h] = abs(dif[h] - median);
}

thrust::sort(thrust::seq, dif, dif + correlationWindow);

for (int i = 0; i <= bMax - 1; i++) {
    smad = smad + pow(dif[i], 2);
}

mini = smad;

newPoIx++;

for (int i = newPoIx; i <= pOiX; i++) {
    smad = 0;
    makeVector(d_mR, windowR, colmL, newPoIy, i);

    for (int h = 0; h < correlationWindow; h++) {
        dif[h] = windowL[h] - windowR[h];
    }

    thrust::sort(thrust::seq, dif, dif + correlationWindow);

    median = dif[(correlationWindow - 1) / 2];

    for (int h = 0; h < correlationWindow; h++) {
        dif[h] = abs(dif[h] - median);
    }

    thrust::sort(thrust::seq, dif, dif + correlationWindow);

    for (int j = 0; j <= bMax - 1; j++) {
        if (smad < mini) {
            smad = smad + pow(dif[j], 2);
        }
        else {
            break;
        }
    }

    if (smad < mini) {
        mini = smad;
        minPoIx = i;
    }
}
int distance = pOiX - minPoIx;

d_disparity[colmLO * (pOiY - minX) + (pOiX - minX)] = ((255 / searchWindow) * distance);
}

Mat SMAD2(int minX, Mat mLO, Mat mRO) {
Mat mL = Mat::zeros(mLO.rows + 2 * minX, mLO.cols + 2 * minX, CV_32FC1);
Mat mR = Mat::zeros(mLO.rows + 2 * minX, mLO.cols + 2 * minX, CV_32FC1);
Mat disparity = Mat::zeros(mRO.rows, mRO.cols, CV_32FC1);

mLO.copyTo(mL.rowRange(minX, mL.rows - minX).colRange(minX, mL.cols - minX));
mRO.copyTo(mR.rowRange(minX, mL.rows - minX).colRange(minX, mL.cols - minX));

float *d_mL, *windowL;
float *d_mR, *windowR;
float *dif;
float *d_disparity;

cudaMallocManaged(&dif, correlationWindow * sizeof(float));
cudaMallocManaged(&windowL, correlationWindow * sizeof(float));
cudaMallocManaged(&windowR, correlationWindow * sizeof(float));
cudaMallocManaged(&d_mL, mL.rows * mL.cols * sizeof(float));
cudaMallocManaged(&d_mR, mR.rows * mR.cols * sizeof(float));
cudaMallocManaged(&d_disparity, disparity.rows * disparity.cols * sizeof(float)); 

/*dif = new float[correlationWindow];
windowL = new float[correlationWindow];
windowR = new float[correlationWindow];
d_mL = new float[mL.rows * mL.cols];
d_mR = new float[mR.rows * mR.cols];
d_disparity = new float[disparity.rows * disparity.cols]; */


memcpy(d_mL, mL.data, mL.rows * mL.cols * sizeof(float));
memcpy(d_mR, mR.data, mR.rows * mR.cols * sizeof(float));
memcpy(d_disparity, disparity.data, disparity.rows * disparity.cols * sizeof(float));

int ind = 0;
int colmL = mL.cols;
int colmLO = mLO.cols;
int npox, npoy;

clock_t begin = clock();

computeSMAD2 <<<70, 50>>>(minX, d_mL, d_mR, dif, windowL, windowR, d_disparity, mLO.cols, mL.cols, searchWindow);
//computeSMAD2 <<<mLO.rows, mLO.cols>>>(minX, d_mL, d_mR, dif, windowL, windowR, d_disparity, mLO.cols, mL.cols, searchWindow);
cudaDeviceSynchronize();

/*
for (int poy = minX; poy < mR.rows - minX; poy++)
{
    for (int pox = minX; pox < mR.cols - minX; pox++)
    {
        //////////////////////// DE GAUCHE A DROITE
        npox = max(minX, pox - searchWindow);
        npoy = poy;

        computeSMAD2(minX, d_mL, d_mR, disparity, d_disparity, windowL, windowR, dif, colmLO, colmL, pox, poy, npox, npoy);

        ind++;
    }
} */

clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;

cout << "time " << elapsed_secs << endl;

cudaFree(dif);
cudaFree(windowL);
cudaFree(windowR);
cudaFree(d_mL);
cudaFree(d_mR);
cudaFree(d_disparity); 

return disparity;
}

int main(int argc, char* argv[]) {

int minX = (int)floor((float)(widthWindow / 2));

Mat mL2 = Mat::ones(70, 50, CV_8UC1) * 255;
Mat mR2 = Mat::zeros(70, 50, CV_8UC1);

Mat disparity = SMAD2(minX, mL2, mR2);

disparity.convertTo(disparity, CV_8UC1);
Mat im;
hconcat(mL2, mR2, im);
hconcat(im, disparity, im);
imshow("test", im);
waitKey(0);

return 0;
}

Answer 1

你的代码错了。正如我所看到的，所有线程都会对同一个数组 dif 进行排序。它导致内存违规。

在gpu上使用推力库进行排序

1 个答案: