I am trying to simply increment a few matrix values in parallel in CUDA and then copy them back to main memory. But when I print them out after the kernel returns, the values are the same. I even tried running the program with a single thread, but no luck. Any help would be greatly appreciated.
My code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>
#define BLOCK_SIZE 1024
#define MAX_N 100000000
#define MAX_THREADS 1024
int num_threads;
int count; // Count of threads that have updated their partition
int size;
//int increment; // VS
int * inc2;
//int my_start;
//Host data
int * thread_ids;
//nvcc -arch=sm_20 -o nbody.exe nbody.cu (compilation)
__global__ void pcyc_red(float * a, float * b, float * c, float * D, float * X,
                         float * a2, float * b2, float * c2, float * D2,
                         int * inc2_dev, int * size_dev, int * num_threads_dev){
    //__threadfence();
    int thread_id = threadIdx.x + (blockIdx.x * blockDim.x);
    float k1;
    float k2;
    int i;
    int start = 0;
    //int end = size_dev-1;
    //int inc2_dev = inc2_dev1[0];
    //int inc_dev = *inc_dev1;
    //int size_dev = size_dev1[0];
    int nthreads = num_threads_dev[0];

    //Thread work assignment
    int chunk_size = size_dev[0]/nthreads;
    int my_start = thread_id*(chunk_size);
    int my_end = start + ((thread_id + 1)*chunk_size - 1);

    //__threadfence();
    __syncthreads();

    //Forward Reduction
    for(i = my_start; i <= my_end; ++i){
        a[i] = a[i]++;
        b[i] = b[i]++;
        c[i] = c[i]++;
        D[i] = D[i]++;
        X[i] = X[i]++;
    }
    __threadfence();
    //__syncthreads();
}//Device Function
float* init_vector(int size){
    float* output;
    output = (float*) calloc(size, sizeof(float));
    int i;
    for(i = 0; i < size; ++i){
        output[i] = 2.0;
    }
    return output;
}

float* init_vector_ac(int s){
    //s will be used for size-1 not to be confused for size.
    float* output;
    output = (float*) calloc(s, sizeof(float));
    int i;
    for(i = 0; i < s; ++i){
        output[i] = -1.0;
    }
    return output;
}
// Main program
int main(int argc, char *argv[]) {
    //num_threads -> atoi(argv[argc-1]);
    //struct timeval start, stop;
    float total_time;
    int i;

    ///Host structures
    float* a;
    float* b;
    float* c;
    float* D;
    float* X;
    //increment = 2; // VS
    inc2 = (int*) malloc(sizeof(int));
    inc2[0] = 1;
    //size = (int*) malloc(sizeof(int));
    //num_threads = (int*) malloc(sizeof(int));
    //my_start = 0;
    //wait_flag = false;

    ///Device Data
    //SYSTEM * sys_dev;
    float * a_dev;
    float * b_dev;
    float * c_dev;
    float * D_dev;
    float * X_dev;
    float * a2_dev;
    float * b2_dev;
    float * c2_dev;
    float * D2_dev;
    //float * X2_dev;
    //int * inc_dev;
    int * inc2_dev;
    //int * mstart_dev;
    int * size_dev;
    int * num_threads_dev;
    int result_var;
    //int final_inc2;

    cudaEvent_t start, stop; // GPU timing variables
    //struct timeval cpu_start, cpu_stop; // CPU timing variables
    // float time_array[10];

    // Timing initializations
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    if (argc != 3)
    {
        printf("Use: <executable_name> <size> <num_threads>\n");
        exit(0);
    }
    if ((size = atoi(argv[argc-2])) > MAX_N)
    {
        printf("Maximum number of nodes allowed: %d\n", MAX_N);
        exit(0);
    };
    if ((num_threads = atoi(argv[argc-1])) > MAX_THREADS)
    {
        printf("Maximum number of threads allowed: %d.\n", MAX_THREADS);
        exit(0);
    };

    int size_array = (size) * sizeof(float);
    int size_array2 = (size - 1) * sizeof(float);

    // Initialize host tridiagonal matrix
    a = init_vector_ac(size-1); // a[i] = -1.0
    b = init_vector(size);      // b[i] = 2.0
    c = init_vector_ac(size-1); // c[i] = -1.0
    D = init_vector(size);      // D[i] = 2.0
    X = init_vector(size);      // X[i] = 2.0
    //xs = init_vector_err(size);

    // Shift elements of a by 1
    for(i = size-1; i > 0; i--) a[i] = a[i-1];
    a[0] = 0.0;

    thread_ids = (int*) calloc(num_threads, sizeof(int));
    count = 0;
    for(i = 0; i < num_threads; ++i){
        thread_ids[i] = i;
    }
    //Cuda Operation
    cudaEventRecord( start, 0);

    cudaMalloc((void **) &a_dev, size);
    cudaMalloc((void **) &b_dev, size);
    cudaMalloc((void **) &c_dev, size);
    cudaMalloc((void **) &D_dev, size);
    cudaMalloc((void **) &X_dev, size);
    cudaMalloc((void **) &a2_dev, size);
    cudaMalloc((void **) &b2_dev, size);
    cudaMalloc((void **) &c2_dev, size);
    cudaMalloc((void **) &D2_dev, size);
    //cudaMalloc((void**)&inc_dev, sizeof(int));
    cudaMalloc((void**)&inc2_dev, sizeof(int));
    //cudaMalloc((void**)&mstart_dev, sizeof(int));
    cudaMalloc((void**)&size_dev, sizeof(int));
    cudaMalloc((void**)&num_threads_dev, sizeof(int));

    cudaMemcpy(a_dev, a, size_array2, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(c_dev, c, size_array2, cudaMemcpyHostToDevice);
    cudaMemcpy(D_dev, D, size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(X_dev, X, size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(a2_dev, a, size_array2, cudaMemcpyHostToDevice);
    cudaMemcpy(b2_dev, b, size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(c2_dev, c, size_array2, cudaMemcpyHostToDevice);
    cudaMemcpy(D2_dev, D, size_array, cudaMemcpyHostToDevice);
    //cudaMemcpy(inc_dev, &increment, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(inc2_dev, inc2, sizeof(int), cudaMemcpyHostToDevice);
    //cudaMemcpy(mstart_dev, &my_start, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(size_dev, &size, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(num_threads_dev, &num_threads, sizeof(int), cudaMemcpyHostToDevice);

    cudaDeviceSynchronize();
    pcyc_red<<<1, num_threads>>>(a_dev, b_dev, c_dev, D_dev, X_dev,
                                 a2_dev, b2_dev, c2_dev, D2_dev,
                                 inc2_dev, size_dev, num_threads_dev);
    cudaDeviceSynchronize();

    cudaMemcpy(X, X_dev, size_array, cudaMemcpyDeviceToHost);
    cudaMemcpy(a, a_dev, size_array, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, b_dev, size_array, cudaMemcpyDeviceToHost);
    cudaMemcpy(c, c_dev, size_array, cudaMemcpyDeviceToHost);
    cudaMemcpy(D, D_dev, size_array, cudaMemcpyDeviceToHost);
    cudaMemcpy(inc2, inc2_dev, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&result_var, num_threads_dev, sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&total_time, start, stop);

    printf("Final Var: %d\n\n", inc2[0]);
    printf("Num Threads Var: %d\n\n", result_var);
    for(i = 0; i < size; ++i){
        printf("a=%8.4f \n", a[i]);
        printf("b=%8.4f \n", b[i]);
        printf("c=%8.4f \n", c[i]);
        printf("D=%8.4f \n", D[i]);
        printf("X=%8.4f \n", X[i]);
    }
    printf("Threads = %d, matrix_size = %d, time = %f\n",
           num_threads, size, total_time);

    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    cudaFree(D_dev);
    cudaFree(X_dev);
    //cudaFree(inc_dev);
    cudaFree(inc2_dev);
    //cudaFree(mstart_dev);
    //cudaFree(size_dev);
    //cudaFree(num_threads_dev);
}//end of main
Answer 0 (score: 4)
Add proper cuda error checking to your code.
One problem I can see is that your allocation sizes don't match your array sizes. To pick just one example:
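A minimal sketch of what that checking could look like (the CUDA_CHECK macro name is illustrative, not from the original code; any equivalent wrapper such as the usual gpuErrchk/gpuAssert pattern works the same way):

// Sketch only: wrap every CUDA runtime call, and check kernel launches separately.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage (hypothetical, applied to the calls in the question):
// CUDA_CHECK(cudaMalloc((void **) &b_dev, size_array));
// CUDA_CHECK(cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice));
// pcyc_red<<<1, num_threads>>>(...);
// CUDA_CHECK(cudaGetLastError());       // catches kernel launch errors
// CUDA_CHECK(cudaDeviceSynchronize());  // catches errors during kernel execution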
int size_array = (size) * sizeof(float);
...
cudaMalloc((void **) &b_dev, size); // size should probably be size_array here
...                          ^^^^
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice); // this won't work, will throw error
                     ^^^^^^^^^^
The above is definitely an error, and there are several of that type in the code. You might also be having machine configuration issues (CUDA not installed correctly, etc.), and the error checking would indicate that as well.
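For reference, a sketch of what that pair of lines would look like with the allocation given in bytes, using the size_array value the question's code already computes (sketch only; the other cudaMalloc calls that pass an element count instead of a byte count would need the same treatment):

// Allocate size * sizeof(float) bytes, matching the host array and the copy below
cudaMalloc((void **) &b_dev, size_array);
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice);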