I'm trying to learn CUDA. I have some basic experience with MPI, so I figured I'd start with some really simple vector operations. I'm trying to write a parallel dot product. Either I'm having trouble allocating/writing memory on the CUDA device, or I'm not correctly bringing the result back to the host (cudaMemcpy()).
/* Code for a CUDA test project doing a basic dot product with doubles */
#include <stdio.h>
#include <cuda.h>
__global__ void GPU_parallelDotProduct(double *array_a, double *array_b, double *dot){
    dot[0] += array_a[threadIdx.x] * array_b[threadIdx.x];
}

__global__ void GPU_parallelSetupVector(double *vector, int dim, int incrSize, int start){
    if(threadIdx.x < dim){
        vector[threadIdx.x] = start + threadIdx.x * incrSize;
    }
}

__host__ void CPU_serialDot(double *first, double *second, double *dot, int dim){
    for(int i=0; i<dim; ++i){
        dot[0] += first[i] * second[i];
    }
}

__host__ void CPU_serialSetupVector(double *vector, int dim, int incrSize, int start){
    for(int i=0; i<dim; ++i){
        vector[i] = start + i * incrSize;
    }
}
int main(){
    //define array size to be used
    //int i,j;
    int VECTOR_LENGTH = 8;
    int ELEMENT_SIZE = sizeof(double);
    //arrays for dot product
    //host
    double *array_a = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
    double *array_b = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
    double *dev_dot_product = (double*) malloc(ELEMENT_SIZE);
    double host_dot_product = 0.0;
    //fill with values
    CPU_serialSetupVector(array_a, VECTOR_LENGTH, 1, 0);
    CPU_serialSetupVector(array_b, VECTOR_LENGTH, 1, 0);
    //host dot
    CPU_serialDot(array_a, array_b, &host_dot_product, VECTOR_LENGTH);
    //device
    double *dev_array_a;
    double *dev_array_b;
    double *dev_dot;
    //allocate cuda memory
    cudaMalloc((void**)&dev_array_a, ELEMENT_SIZE * VECTOR_LENGTH);
    cudaMalloc((void**)&dev_array_b, ELEMENT_SIZE * VECTOR_LENGTH);
    cudaMalloc((void**)&dev_dot, ELEMENT_SIZE);
    //copy from host to device
    cudaMemcpy(dev_array_a, array_a, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_array_b, array_b, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_dot, &dev_dot_product, ELEMENT_SIZE, cudaMemcpyHostToDevice);
    //init vectors
    //GPU_parallelSetupVector<<<1, VECTOR_LENGTH>>>(dev_array_a, VECTOR_LENGTH, 1, 0);
    //GPU_parallelSetupVector<<<1, VECTOR_LENGTH>>>(dev_array_b, VECTOR_LENGTH, 1, 0);
    //GPU_parallelSetupVector<<<1, 1>>>(dev_dot, VECTOR_LENGTH, 0, 0);
    //perform CUDA dot product
    GPU_parallelDotProduct<<<1, VECTOR_LENGTH>>>(dev_array_a, dev_array_b, dev_dot);
    //get computed product back to the machine
    cudaMemcpy(dev_dot, dev_dot_product, ELEMENT_SIZE, cudaMemcpyDeviceToHost);

    FILE *output = fopen("test_dotProduct_1.txt", "w");
    fprintf(output, "HOST CALCULATION: %f \n", host_dot_product);
    fprintf(output, "DEV CALCULATION: %f \n", dev_dot_product[0]);
    fprintf(output, "PRINTING DEV ARRAY VALS: ARRAY A\n");
    for(int i=0; i<VECTOR_LENGTH; ++i){
        fprintf(output, "value %i: %f\n", i, dev_array_a[i]);
    }

    free(array_a);
    free(array_b);
    cudaFree(dev_array_a);
    cudaFree(dev_array_b);
    cudaFree(dev_dot);
    return(0);
}
Here is a sample of the output:
HOST CALCULATION: 140.000000
DEV CALCULATION: 0.000000
PRINTING DEV ARRAY VALS: ARRAY A
value 0: -0.000000
value 1: 387096841637590350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 2: -9188929998371095800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 3: 242247762331550610000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 4: -5628111589595087500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 5: 395077289052074410000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
value 6: 0.000000
value 7: -13925691551991564000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000
Answer 0 (score: 4)
I can see two problems:

Your GPU dot product contains a memory race:

    dot[0] += array_a[threadIdx.x] * array_b[threadIdx.x];

This is unsafe: every thread in the block will attempt to write/overwrite the same memory location with its result. The programming model makes no guarantees about what happens when multiple threads try to write different values to the same piece of memory.
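One common way around the race (a sketch added here for illustration, not part of the original answer; dotProductReduced is a hypothetical name) is to give every thread its own slot in shared memory and then combine the partial products with a tree reduction. This assumes the block size is a power of two, as in the question's launch of 8 threads:

    __global__ void dotProductReduced(double *array_a, double *array_b, double *dot){
        __shared__ double partial[8];               // 8 == VECTOR_LENGTH in the question
        int tid = threadIdx.x;
        partial[tid] = array_a[tid] * array_b[tid]; // each thread writes its own slot: no race
        __syncthreads();
        //tree reduction: halve the number of active threads each step
        for(int stride = blockDim.x / 2; stride > 0; stride /= 2){
            if(tid < stride){
                partial[tid] += partial[tid + stride];
            }
            __syncthreads();
        }
        if(tid == 0){
            dot[0] = partial[0];                    // exactly one thread writes the result
        }
    }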
The second problem: dev_array_a is not directly accessible by the host; it is a pointer in GPU memory. You must use a device-to-host copy if you want to examine the contents of dev_array_a, which is why the final loop in your code prints garbage. The suggestion about error checking made in the other answer is also a very good point. Every API call returns a status, and you should check the status of all the calls you make to confirm that no errors or faults occur at runtime.
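For illustration, one way the question's final loop could be fixed (host_view is a hypothetical host-side buffer introduced here, not from the question's code):

    //copy the device data back to the host before printing it
    double host_view[8];   // 8 == VECTOR_LENGTH in the question
    cudaMemcpy(host_view, dev_array_a, 8 * sizeof(double), cudaMemcpyDeviceToHost);
    for(int i = 0; i < 8; ++i){
        printf("value %i: %f\n", i, host_view[i]);
    }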
Answer 1 (score: 3)
It's a good idea to check the status of CUDA runtime calls like cudaMalloc and cudaMemcpy, as well as kernel launches. You can do the following after each call, or wrap the calls in some sort of macro that does the check for you.
    if (cudaSuccess != cudaGetLastError())
        printf("Error!\n");
Now, I'm not sure whether this is the cause of your problem, but doing this will eliminate the obvious.