Iteratively using the self-implemented cuIDCT.cu gives changing output each time the code is re-run

Asked: 2016-01-04 18:02:21

Tags: matlab cuda gpu dct cufft

I have implemented a CUDA version of the inverse discrete cosine transform (IDCT) by "translating" the MATLAB built-in function idct.m into a CUDA file cuIDCT.cu:

  • My implementation is cuIDCT.cu, shown below; it works when m = n and both m and n are even numbers (the weights it computes are summarized briefly after the listing).

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cufft.h>
#include <cuComplex.h>

// round up n/m
inline int iDivUp(int n, int m)
{
    return (n + m - 1) / m;
}

typedef cufftComplex complex;

#define PI 3.1415926535897932384626433832795028841971693993751

__global__
void idct_ComputeWeightsKernel(const int n, complex *ww)
{
    const int pos = threadIdx.x + blockIdx.x * blockDim.x;

    if (pos >= n) return;

    ww[pos].x = sqrtf(2*n) * cosf(pos*PI/(2*n));
    ww[pos].y = sqrtf(2*n) * sinf(pos*PI/(2*n));
}

__global__
void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;

    const int pos = ix + iy*n;

    // Compute precorrection factor
    ww[0].x = ww[0].x / sqrtf(2);
    ww[0].y = ww[0].y / sqrtf(2);

    y[iy + ix*m].x = ww[iy].x * b[pos];
    y[iy + ix*m].y = ww[iy].y * b[pos];
}

__global__
void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;

    const int pos = ix + iy*n;

    yy[iy + ix*n].x = y[pos].x / (float) n;
    yy[iy + ix*n].y = y[pos].y / (float) n;
}

__global__
void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;

    const int pos = ix + iy*n;

    // Re-order elements of each column according to equations (5.93) and (5.94) in Jain
    if (iy < n/2)
    {
        a[ix + 2*iy*n]     = yy[pos].x;
        a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
    }
}

/**
 * a = idct(b), where a is of size [n m].
 * @param b, input array
 * @param n, first dimension of a
 * @param m, second dimension of a
 * @param a, output array
 */
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
{
    const int data_size = n * m * sizeof(float);

    // device memory allocation
    float *d_in, *d_out;
    cudaMalloc(&d_in, data_size);
    cudaMalloc(&d_out, data_size);

    // transfer data from host to device
    cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice);

    // compute IDCT using CUDA
    // begin============================================
    // Compute weights
    complex *ww;
    cudaMalloc(&ww, n*sizeof(complex));
    dim3 threads(256);
    dim3 blocks(iDivUp(n, threads.x));
    idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);

    complex *y;
    complex *yy;
    cufftHandle plan;

    dim3 threads1(32, 6);
    dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case

    int Length[1] = {m}; // for each IFFT, the length is m

    cudaMalloc(&y, n*m*sizeof(complex));

    idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);

    cufftPlanMany(&plan, 1, Length,
                  Length, 1, m,
                  Length, 1, m, CUFFT_C2C, n);
    cufftExecC2C(plan, y, y, CUFFT_INVERSE); // y is of size [n m]

    cudaMalloc(&yy, n*m*sizeof(complex));
    Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
    Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
    // end============================================

    // transfer result from device to host
    cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost);

    // cleanup
    cufftDestroy(plan);
    cudaFree(ww);
    cudaFree(y);
    cudaFree(yy);
    cudaFree(d_in);
    cudaFree(d_out);
}
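For reference (this just restates what the kernels above do, following MATLAB's idct.m for even-length input): the weights computed by idct_ComputeWeightsKernel are

    w(k) = sqrt(2*n) * (cos(k*pi/(2*n)) + j*sin(k*pi/(2*n))),  k = 0, ..., n-1,

with w(0) additionally divided by sqrt(2) as a precorrection factor; each weighted column is then passed through a length-m inverse FFT (cuFFT, CUFFT_INVERSE) and re-ordered into even/odd output rows per equations (5.93) and (5.94) in Jain.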


I then compared the result of my CUDA IDCT (i.e. cuIDCT.cu) against MATLAB's idct.m using the following code:

  • a test main function main.cpp, and
  • a MATLAB main function main.m to read the result from CUDA and compare it against MATLAB.

main.cpp

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <stdlib.h>
#include <stdio.h>

// N must equal M, and both must be even numbers
#define N 256
#define M 256

void WriteDataFile(const char *name, int w, int h, const float *in, const float *out)
{
    FILE *stream;
    stream = fopen(name, "wb");

    float data = 202021.25f;
    fwrite(&data, sizeof(float), 1, stream);
    fwrite(&w,    sizeof(w),     1, stream);
    fwrite(&h,    sizeof(h),     1, stream);

    for (int i = 0; i < h; i++)
        for (int j = 0; j < w; j++)
        {
            const int pos = j + i * h;
            fwrite(in  + pos, sizeof(float), 1, stream);
            fwrite(out + pos, sizeof(float), 1, stream);
        }

    fclose(stream);
}

void cuIDCT(float *b, int n, int m, float *a);

int main()
{
    // host memory allocation
    float *h_in   = new float [N * M];
    float *h_out  = new float [N * M];
    float *h_temp = new float [N * M];

    // input data initialization
    for (int i = 0; i < N * M; i++)
    {
        h_in[i]   = (float)rand()/(float)RAND_MAX;
        h_out[i]  = h_in[i];
        h_temp[i] = h_in[i];
    }

    // please comment out either one case for testing
    // test case 1: use cuIDCT.cu once
    // cuIDCT(h_in, N, M, h_out);

    // test case 2: iteratively use cuIDCT.cu
    for (int i = 0; i < 4; i++)
    {
        if (i % 2 == 0)
            cuIDCT(h_out, N, M, h_temp);
        else
            cuIDCT(h_temp, N, M, h_out);
    }

    // write data, for further visualization using MATLAB
    WriteDataFile("test.flo", N, M, h_in, h_out);

    // cleanup
    delete [] h_in;
    delete [] h_out;
    delete [] h_temp;
    cudaDeviceReset();
}

main.m

clc;clear;

% read
[h_in, h_out] = read_data('test.flo');

% MATLAB result, for test case 1, comment the for-loop
matlab_out = h_in;
for i = 1:4
    matlab_out = idct(matlab_out);
end

% compare
err = matlab_out - h_out;

% show
figure(1);
subplot(221);   imshow(h_in,  []);       title('h\_in');        colorbar
subplot(222);   imshow(h_out, []);       title('h\_out');       colorbar
subplot(223);   imshow(matlab_out, []);  title('matlab\_out');  colorbar
subplot(224);   imshow(err,   []);       title('error map');    colorbar

disp(['maximum error between CUDA and MATLAB is ' ...
        num2str(max(max(abs(err))))])


I ran the code in Visual Studio 11 (i.e. VS2012) on Windows 7, with an Nvidia Tesla K20c GPU and CUDA Toolkit version 7.5. My MATLAB version is R2015b.
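For reference, an equivalent command-line build (outside the Visual Studio project) would look roughly like the line below; the CUDA samples include path (for helper_functions.h) and the sm_35 architecture flag for the K20c are assumptions about my setup:

    nvcc -arch=sm_35 -I"C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.5\common\inc" main.cpp cuIDCT.cu -lcufft -o main.exe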

My test procedure:

  • For test case 1: uncomment test case 1 and comment out test case 2.
    1. Run main.cpp.
    2. Run main.m in MATLAB.
    3. Repeat step 1 and step 2 (without any change, just re-run the code).

I repeated step 3 twenty times. The output result is unchanged, and the result shown by main.m is:

results of test case 1

The maximum error is 7.7152e-07.

  • For test case 2: uncomment test case 2 and comment out test case 1.
    1. Run main.cpp.
    2. Run main.m in MATLAB.
    3. Repeat step 1 and step 2 (without any change, just re-run the code).

I repeated step 3 twenty times. The output results change from run to run; the results shown by main.m are (I do not have enough reputation to post all the images, so only the wrong case is shown below):

one situation (the wrong one) of test case 2

The maximum errors were 0.45341 (2 runs), 0.44898 (1 run), 0.26186 (1 run), 0.26301 (1 run), and 9.5716e-07 (15 runs).

From the test results, my conclusions are:

  • From test case 1: cuIDCT.cu is numerically correct (error ~10^-7) with respect to idct.m.
  • From test case 2: iteratively using cuIDCT.cu gives unstable results (i.e. the output changes every time the code is re-run, and may sometimes be numerically wrong, error ~0.1).

My question:

From test case 1 we know that cuIDCT.cu is numerically correct with respect to idct.m. So why does iteratively using cuIDCT.cu lead to different output results each time the code is re-run?

Any help or suggestion is greatly appreciated.

1 Answer:

Answer 0 (score: 1)

I believe the variability in your results is coming about due to this code in your idct_ComputeEvenKernel:

// Compute precorrection factor
ww[0].x = ww[0].x / sqrtf(2);
ww[0].y = ww[0].y / sqrtf(2);

It is not entirely clear what your intent is here, but it is doubtful that this code could be doing what you want. You may be confused about the CUDA execution model.

The above code will be executed by every CUDA thread that you launch for that kernel and that passes the thread check:

if (ix >= n || iy >= m) return;

I believe this means 65536 threads will all be executing this code in that kernel. Furthermore, the threads will be executing that code in more or less any order (not all CUDA threads execute in lock-step). They may even step on each other as they try to write their values out to the location ww[0]. So the final result in ww[0] will be quite unpredictable.

When I comment out those lines of code, the results become stable for me (although different from the results with those lines in place), unchanging from run to run.
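For illustration, one race-free alternative is to fold the precorrection into the weights kernel and guard it so that only a single thread ever touches the first weight; this is just a sketch of the idea, and the full modified file at the end of this answer takes the same approach:

__global__ void idct_ComputeWeightsKernel(const int n, complex *ww)
{
    const int pos = threadIdx.x + blockIdx.x * blockDim.x;
    if (pos >= n) return;

    complex temp;
    temp.x = sqrtf(2*n) * cosf(pos*PI/(2*n));
    temp.y = sqrtf(2*n) * sinf(pos*PI/(2*n));

    // only the thread with pos == 0 applies the precorrection,
    // so there is no write race on ww[0]
    if (pos == 0) {
        temp.x /= sqrtf(2);
        temp.y /= sqrtf(2);
    }
    ww[pos] = temp;
}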

I would like to point out something else. Wherever you are computing the .x and .y values of a complex quantity, my suggestion would be to rework the code from this (for example):

y[iy + ix*m].x = ww[iy].x * b[pos];
y[iy + ix*m].y = ww[iy].y * b[pos];

to this:

complex temp1, temp2;
temp1 = ww[iy];
temp2.x = temp1.x * b[pos];
temp2.y = temp1.y * b[pos];
y[iy + ix*m] = temp2;

At least according to my testing, the compiler does not seem to make this optimization for you, and one side-effect benefit is that it makes the code much easier to test with cuda-memcheck --tool initcheck .... In the first realization, the compiler will load y[iy + ix*m] as an 8-byte quantity, modify either 4 or 8 bytes of it, then store y[iy + ix*m] as an 8-byte quantity. The second realization should be more efficient (it eliminates the load of y[]), and it eliminates the load of an uninitialized quantity (y[]), which the cuda-memcheck initcheck tool would otherwise report as a hazard.
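For reference, that initcheck run is just the standard invocation of the tool on the built test program, e.g. cuda-memcheck --tool initcheck main.exe (substitute whatever your executable is actually named).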

The variability I am describing should be possible whether you run the 1-pass version of your code or the 4-pass version. Therefore I think your statement about the 1-pass version being correct is suspect. I think if you run the 1-pass version enough, you will eventually see variability (although it may require different initial memory conditions, or running on a different GPU type). Even in your own results, we see that 15 out of 20 runs of the 4-pass code produce "correct" results, i.e. a residual error of ~1e-7.

Here is my modified version of cuIDCT.cu, modified from the version you posted here. The assumption I am making below is that you only wanted the scaling on ww[0] to be computed once, in which case we can easily handle that arithmetic as an addendum to the previous idct_ComputeWeightsKernel:


#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cufft.h>
#include <cuComplex.h>
#include <helper_cuda.h>
#include "assert.h"

// round up n/m
inline int iDivUp(int n, int m)
{
    return (n + m - 1) / m;
}

typedef cufftComplex complex;

#define PI 3.1415926535897932384626433832795028841971693993751

#define cufftSafeCall(err)  __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if (CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, err, _cudaGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
    }
}

__global__
void idct_ComputeWeightsKernel(const int n, complex *ww)
{
    const int pos = threadIdx.x + blockIdx.x * blockDim.x;

    if (pos >= n) return;

    complex temp;
    temp.x = sqrtf(2*n) * cosf(pos*PI/(2*n));
    temp.y = sqrtf(2*n) * sinf(pos*PI/(2*n));

    if (pos == 0) {
        temp.x /= sqrtf(2);
        temp.y /= sqrtf(2);
    }

    ww[pos] = temp;
}

__global__
void idct_ComputeEvenKernel(const float *b, const int n, const int m, complex *ww, complex *y)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;

    const int pos = ix + iy*n;

/*  handle this in idct_ComputeWeightsKernel
    // Compute precorrection factor
    ww[0].x = ww[0].x / sqrtf(2);
    ww[0].y = ww[0].y / sqrtf(2);
*/

    complex temp1, temp2;
    temp1 = ww[iy];
    temp2.x = temp1.x * b[pos];
    temp2.y = temp1.y * b[pos];
    y[iy + ix*m] = temp2;
}

__global__
void Reordering_a0_Kernel(complex *y, const int n, const int m, complex *yy)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;

    const int pos = ix + iy*n;

    complex temp1, temp2;
    temp1 = y[pos];
    temp2.x = temp1.x / (float) n;
    temp2.y = temp1.y / (float) n;
    yy[iy + ix*n] = temp2;
}

__global__
void Reordering_a_Kernel(complex *yy, const int n, const int m, float *a)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= n || iy >= m) return;

    const int pos = ix + iy*n;

    // Re-order elements of each column according to equations (5.93) and (5.94) in Jain
    if (iy < n/2)
    {
        a[ix + 2*iy*n]     = yy[pos].x;
        a[ix + (2*iy+1)*n] = yy[ix + (m-iy-1)*n].x;
    }
}

/**
 * a = idct(b), where a is of size [n m].
 * @param b, input array
 * @param n, first dimension of a
 * @param m, second dimension of a
 * @param a, output array
 */
void cuIDCT(float *h_in, int n, int m, float *h_out) // a is of size [n m]
{
    const int data_size = n * m * sizeof(float);

    // device memory allocation
    float *d_in, *d_out;
    checkCudaErrors(cudaMalloc(&d_in, data_size));
    checkCudaErrors(cudaMalloc(&d_out, data_size));

    // transfer data from host to device
    checkCudaErrors(cudaMemcpy(d_in, h_in, data_size, cudaMemcpyHostToDevice));

    // compute IDCT using CUDA
    // begin============================================
    // Compute weights
    complex *ww;
    checkCudaErrors(cudaMalloc(&ww, n*sizeof(complex)));
    dim3 threads(256);
    dim3 blocks(iDivUp(n, threads.x));
    idct_ComputeWeightsKernel<<<blocks, threads>>>(n, ww);

    complex *y;
    complex *yy;
    cufftHandle plan;

    dim3 threads1(32, 6);
    dim3 blocks2(iDivUp(n, threads1.x), iDivUp(m, threads1.y)); // for even case

    int Length[1] = {m}; // for each IFFT, the length is m

    checkCudaErrors(cudaMalloc(&y, n*m*sizeof(complex)));

    idct_ComputeEvenKernel<<<blocks2, threads1>>>(d_in, n, m, ww, y);

    cufftSafeCall(cufftPlanMany(&plan, 1, Length,
                                Length, 1, m,
                                Length, 1, m, CUFFT_C2C, n));
    cufftSafeCall(cufftExecC2C(plan, y, y, CUFFT_INVERSE)); // y is of size [n m]

    checkCudaErrors(cudaMalloc(&yy, n*m*sizeof(complex)));
    Reordering_a0_Kernel<<<blocks2, threads1>>>(y, n, m, yy);
    cudaMemset(d_out, 0, data_size);
    Reordering_a_Kernel<<<blocks2, threads1>>>(yy, n, m, d_out);
    // end============================================

    // transfer result from device to host
    checkCudaErrors(cudaMemcpy(h_out, d_out, data_size, cudaMemcpyDeviceToHost));

    // cleanup
    cufftDestroy(plan);
    checkCudaErrors(cudaFree(ww));
    checkCudaErrors(cudaFree(y));
    checkCudaErrors(cudaFree(yy));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));
}

You will note that I added a cudaMemset on d_out, because it helped me clear up an issue with cuda-memcheck --tool initcheck. It should not be necessary; you can delete it if you want.