我刚刚进入CUDA而且我遇到了一个我无法理解的问题。基本上我正在编写一个程序来使用Simpson的方法来数值积分函数f(x)= x ^ 2。我这样做的方法是在每个框之间创建一个边界数组,其区域是用Simpson方法计算的,将该边界数组传递给GPU,让每个处理器找到每个有界框的区域并返回一个结果区域的数组。然后将这些区域相加以获得完整的积分。当我尝试访问GPU上的边界数组时,我的问题出现了。数组很好,在CPU上有适当的值,但在我复制并在GPU上访问它之后,这些值都是无意义的,我找不到原因。我的代码如下,任何帮助都将非常感激。
第一个类是获取用户输入并定义CPU数组的主类。
#include <iostream> //Necessary for std::cout
#include <iomanip> //Necessary for std::setprecision
#include <ctime> //Necessary for clock_t
using namespace std;
double* gpu_run(double * h_bound, double * h_resultArr, int SIZE);
int main() {
double step = 0.5, upper = 0, result = 0;
double * h_bound = NULL, * h_resultArr = NULL;
int SIZE = 0;
cout << "Enter the upper bound: ";
cin >> upper;
SIZE = upper/step + 1; //The number of bounds, which is one more than the number of integration times
h_bound = new double[SIZE];
h_resultArr = new double[SIZE-1];
for (int i = 0; i < SIZE; i++){
h_bound[i] = i*step;
}
clock_t t = clock();
h_resultArr = gpu_run(h_bound, h_resultArr, SIZE);
for (int i = 0; i < SIZE; i++){
result += h_resultArr[i];
}
t = clock() - t;
cout << "Calculation is done and took " << ((double)t)/CLOCKS_PER_SEC << " seconds." << endl;
cout << "The integral of x^2 from 0 to " << upper << ", using Simpson's Method is: " << setprecision(10) << result << endl;
return 0;
}
此时main方法调用了gpu_run,它是包含进行实际计算的方法的cuda代码。当我使用3的上限作为积分时(因为答案应该是9),并且使用0.5的步长,我得到0,0.5,1,1.5,2,2.5和3的界限,因为我期望。 gpu_run代码是
#include <iostream>
#include <cuda_runtime.h>
#include <cstdio>
#include "gpu_run.h"
using namespace std;
double* gpu_run(double * h_bound, double * h_resultArr, int SIZE) {
double * d_bound = NULL;
cudaMalloc((void **)&d_bound, sizeof(double)*SIZE);
cudaMemcpy(d_bound, h_bound, SIZE, cudaMemcpyHostToDevice);
double * d_resultArr = NULL;
cudaMalloc((void **)&d_resultArr, sizeof(double)*(SIZE-1));
int threadsPerBlock = 256;
int blocksPerGrid =(SIZE + threadsPerBlock - 1) / threadsPerBlock;
simpsons<<<blocksPerGrid, threadsPerBlock>>>(d_bound, d_resultArr, SIZE);
cudaMemcpy(h_resultArr, d_resultArr, SIZE-1, cudaMemcpyDeviceToHost);
return h_resultArr;
}
在这个程序中,d_ prescript用于指定GPU上存在的数组,存储在h_bound中的值在这里仍然是正确的。最后,我有CUDA方法simpsons的标题,称为
__global__ void simpsons(double * bound, double * resultArr, int SIZE){
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < SIZE-1){
double a = bound[i];
double b = bound[i+1];
printf("i: %d lower bound: %d upper bound: %d \n", i, bound[i], bound[i+1]);
resultArr[i] = ((b-a)/6)*(a*a + (a+b)*(a+b) + b*b);
}
}
对于每个处理器,我需要它来访问其各自&#34;框的两个边界&#34;在函数下并使用simpson方法在这两个边界处计算面积,但是,此方法中绑定数组中的值是无意义值。我究竟做错了什么?我觉得这是一个非常愚蠢的错误,但我似乎无法找到它。
答案 0 :(得分:1)
当我更改您的代码以解决@talonmies指出的问题,并将printf格式说明符修改为正确的打印出float
/ double
数量的问题时,我得到的似乎是根据您的描述更正结果:
cudaMemcpy
操作以包含sizeof(double)
%d
替换为%f
double
转换为int
修正了代码和结果:
$ cat t495.cu
#include <iostream> //Necessary for std::cout
#include <iomanip> //Necessary for std::setprecision
#include <ctime> //Necessary for clock_t
using namespace std;
__global__ void simpsons(double * bound, double * resultArr, int SIZE){
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < SIZE-1){
double a = bound[i];
double b = bound[i+1];
printf("i: %d lower bound: %lf upper bound: %lf \n", i, bound[i], bound[i+1]);
resultArr[i] = ((b-a)/6)*(a*a + (a+b)*(a+b) + b*b);
}
}
double* gpu_run(double * h_bound, double * h_resultArr, int SIZE) {
double * d_bound = NULL;
cudaMalloc((void **)&d_bound, sizeof(double)*SIZE);
cudaMemcpy(d_bound, h_bound, SIZE*sizeof(double), cudaMemcpyHostToDevice);
double * d_resultArr = NULL;
cudaMalloc((void **)&d_resultArr, sizeof(double)*(SIZE-1));
int threadsPerBlock = 256;
int blocksPerGrid =(SIZE + threadsPerBlock - 1) / threadsPerBlock;
simpsons<<<blocksPerGrid, threadsPerBlock>>>(d_bound, d_resultArr, SIZE);
cudaMemcpy(h_resultArr, d_resultArr, (SIZE-1)*sizeof(double), cudaMemcpyDeviceToHost);
return h_resultArr;
}
int main() {
double step = 0.5, upper = 0, result = 0;
double * h_bound = NULL, * h_resultArr = NULL;
int SIZE = 0;
cout << "Enter the upper bound: ";
cin >> upper;
SIZE = (int)(upper/step + 1); //The number of bounds, which is one more than the number of integration times
h_bound = new double[SIZE];
h_resultArr = new double[SIZE-1];
for (int i = 0; i < SIZE; i++){
h_bound[i] = i*step;
}
clock_t t = clock();
h_resultArr = gpu_run(h_bound, h_resultArr, SIZE);
for (int i = 0; i < SIZE; i++){
result += h_resultArr[i];
}
t = clock() - t;
cout << "Calculation is done and took " << ((double)t)/CLOCKS_PER_SEC << " seconds." << endl;
cout << "The integral of x^2 from 0 to " << upper << ", using Simpson's Method is: " << setprecision(10) << result << endl;
return 0;
}
$ nvcc -arch=sm_20 -o t495 t495.cu
$ cuda-memcheck ./t495
========= CUDA-MEMCHECK
Enter the upper bound: 3
i: 0 lower bound: 0.000000 upper bound: 0.500000
i: 1 lower bound: 0.500000 upper bound: 1.000000
i: 2 lower bound: 1.000000 upper bound: 1.500000
i: 3 lower bound: 1.500000 upper bound: 2.000000
i: 4 lower bound: 2.000000 upper bound: 2.500000
i: 5 lower bound: 2.500000 upper bound: 3.000000
Calculation is done and took 1.08 seconds.
The integral of x^2 from 0 to 3, using Simpson's Method is: 9
========= ERROR SUMMARY: 0 errors
$