I'm trying to optimize code that uses OpenCV with CUDA and the cuFFT library. Every time I perform a fast Fourier transform I have to download a cv::Mat from the GpuMat, run cuFFT (see the code below), and then upload the FFT result back to a GpuMat. Is there any way to optimize this? I'd like to know whether I can work with the GpuMat directly, without having to download it.
std::vector<cv::cuda::GpuMat> ReconClass::FFT2(std::vector<cv::cuda::GpuMat>& mat, int height, int width)
{
    cufftHandle plan;
    cufftComplex* data, * datao, * devdata, * devdatao;
    size_t arraySize = sizeof(cufftComplex) * mat[0].size().area();
    cudaMallocHost((void**)&data, arraySize);   // pinned host staging buffers
    cudaMallocHost((void**)&datao, arraySize);
    cudaMalloc((void**)&devdata, arraySize);    // device input/output buffers
    cudaMalloc((void**)&devdatao, arraySize);

    // Download the planar real/imaginary GpuMats to the host.
    cv::Mat iReal;
    cv::Mat iImag;
    mat[0].download(iReal);
    mat[1].download(iImag);

    // Interleave the two planes into a packed cufftComplex array on the host.
    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            data[i * width + j].x = iReal.at<float>(i, j);
            data[i * width + j].y = iImag.at<float>(i, j);
        }
    }
    cudaMemcpy(devdata, data, arraySize, cudaMemcpyHostToDevice);

    cufftPlan2d(&plan, height, width, CUFFT_C2C);
    if (!plan)
        std::cout << "the cufftPlan2d plan returned is null" << std::endl;
    cufftExecC2C(plan, devdata, devdatao, CUFFT_FORWARD);

    // Copy the spectrum back to the host and split it into real/imaginary planes.
    cudaMemcpy(datao, devdatao, arraySize, cudaMemcpyDeviceToHost);
    cv::Mat realRecon(height, width, CV_32F);
    cv::Mat imagRecon(height, width, CV_32F);
    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            realRecon.at<float>(i, j) = datao[i * width + j].x;
            imagRecon.at<float>(i, j) = datao[i * width + j].y;
        }
    }

    // Upload the result back to the GPU.
    cv::cuda::GpuMat mat1, mat2;
    mat1.upload(realRecon);
    mat2.upload(imagRecon);
    std::vector<cv::cuda::GpuMat> re = { mat1, mat2 };

    cufftDestroy(plan);
    cudaFreeHost(data);
    cudaFreeHost(datao);
    cudaFree(devdata);
    cudaFree(devdatao);
    return re;
}
Answer 0 (score: -2)
I was able to avoid copying the data to the CPU and back to the device. In addition, doing the FFT in place also helps performance. My code is pasted below.
void Dataransfer2Cuda(const cv::InputArray _dReal, const cv::InputArray _dImag, float2* zCufftcomplex)
{
    const cv::cuda::GpuMat Real = _dReal.getGpuMat();
    const cv::cuda::GpuMat Imag = _dImag.getGpuMat();

    // One thread per pixel; round the grid up so the whole image is covered.
    dim3 cthreads(32, 32);
    dim3 cblocks(
        static_cast<int>(std::ceil(Real.size().width /
                                   static_cast<double>(cthreads.x))),
        static_cast<int>(std::ceil(Real.size().height /
                                   static_cast<double>(cthreads.y))));

    // Interleave the planar real/imaginary GpuMats into the packed cufftComplex
    // device buffer, without leaving the GPU.
    Kernel_DataTransfer2Cuda<<<cblocks, cthreads>>>(Real, Imag, zCufftcomplex);
    if (cudaSuccess != cudaGetLastError())
        std::cout << "Dataransfer2Cuda(): gave an error" << std::endl;
    return;
}
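
The kernel Kernel_DataTransfer2Cuda itself is not shown in the answer. A minimal sketch of what it could look like, assuming the GpuMat arguments arrive as cv::cuda::PtrStepSzf (the implicit conversion OpenCV provides for kernel parameters):

#include <opencv2/core/cuda.hpp>
#include <cuda_runtime.h>

// Hypothetical reconstruction of the kernel referenced above, not the answer's original code.
__global__ void Kernel_DataTransfer2Cuda(const cv::cuda::PtrStepSzf real,
                                         const cv::cuda::PtrStepSzf imag,
                                         float2* out)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= real.cols || y >= real.rows)
        return;
    // Pack the two planar 32-bit float planes into the interleaved cufftComplex layout.
    out[y * real.cols + x] = make_float2(real(y, x), imag(y, x));
}

Using PtrStepSzf means the pitched GpuMat rows are addressed correctly even when the matrices are not continuous.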
void DataransferFromCuda(const float2* zCufftcomplex, cv::OutputArray _outputReal, cv::OutputArray _outputImag, std::size_t iWidth, std::size_t iHeight)
{
    _outputReal.create(iHeight, iWidth, CV_32F);
    _outputImag.create(iHeight, iWidth, CV_32F);
    cv::cuda::GpuMat outputReal = _outputReal.getGpuMat();
    cv::cuda::GpuMat outputImag = _outputImag.getGpuMat();

    dim3 cthreads(32, 32);
    dim3 cblocks(
        static_cast<int>(std::ceil(outputReal.size().width /
                                   static_cast<double>(cthreads.x))),
        static_cast<int>(std::ceil(outputReal.size().height /
                                   static_cast<double>(cthreads.y))));

    // Split the packed cufftComplex spectrum back into planar real/imaginary GpuMats.
    Kernel_DataTransferFromCuda<<<cblocks, cthreads>>>(zCufftcomplex, outputReal, outputImag);
    if (cudaSuccess != cudaGetLastError())
        std::cout << "DataransferFromCuda(): gave an error" << std::endl;
    return;
}
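
Likewise, Kernel_DataTransferFromCuda is not shown. A sketch of the inverse kernel under the same assumptions:

// Hypothetical reconstruction; splits the interleaved spectrum back into two planes.
__global__ void Kernel_DataTransferFromCuda(const float2* in,
                                            cv::cuda::PtrStepSzf real,
                                            cv::cuda::PtrStepSzf imag)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= real.cols || y >= real.rows)
        return;
    const float2 v = in[y * real.cols + x];
    real(y, x) = v.x;
    imag(y, x) = v.y;
}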
std::vector<cv::cuda::GpuMat> ReconClass::FFT2(std::vector<cv::cuda::GpuMat>& mat, int height, int width)
{
    cufftHandle plan;
    cufftComplex* devdata;
    size_t arraySize = sizeof(cufftComplex) * mat[0].size().area();
    cudaMalloc((void**)&devdata, arraySize);

    // Interleave the input GpuMats straight into device memory, no host round trip.
    Dataransfer2Cuda(mat[0], mat[1], devdata);

    cufftPlan2d(&plan, height, width, CUFFT_C2C);
    if (!plan)
        std::cout << "the cufftPlan2d plan returned is null" << std::endl;

    // In-place transform: input and output buffers are the same.
    cufftExecC2C(plan, devdata, devdata, CUFFT_FORWARD);

    // Split the spectrum back into planar GpuMats, still on the device.
    cv::cuda::GpuMat mat1, mat2;
    DataransferFromCuda(devdata, mat1, mat2, width, height);
    std::vector<cv::cuda::GpuMat> re = { mat1, mat2 };

    cufftDestroy(plan);
    cudaFree(devdata);
    return re;
}
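
For completeness, a hypothetical call site, assuming two planar CV_32F GpuMats of the same size (the ReconClass setup itself is not shown in the question):

// Hypothetical usage sketch; ReconClass construction details are assumed.
cv::cuda::GpuMat real(512, 512, CV_32F), imag(512, 512, CV_32F);
real.setTo(cv::Scalar(1.0f));
imag.setTo(cv::Scalar(0.0f));
std::vector<cv::cuda::GpuMat> planes = { real, imag };
ReconClass recon;
std::vector<cv::cuda::GpuMat> spectrum = recon.FFT2(planes, 512, 512);

If FFT2 is called repeatedly with the same dimensions, caching the cufftHandle instead of creating and destroying it on every call would also avoid the comparatively expensive plan setup.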