我遇到调用矩阵加法内核的问题,这会导致大于255x255的矩阵超时。有关详细信息,请使用-arch = sm_21进行编译并在MacBook Pro上运行。我尝试过使用不同大小的块和线程。
矩阵结构:
typedef struct {
int n, m; /* Define an n-rows by m-columns matrix */
double* data;
} c_matrix;
初始化矩阵和包装器以添加矩阵:
extern "C"
c_matrix *new_c_matrix(int i, int j) {
c_matrix *m = (c_matrix *)malloc(sizeof(*m));
if(m == NULL)
return NULL;
m->data = (double *)malloc(sizeof(double) * i * j);
if(m->data == NULL) {
free(m);
return NULL;
}
m->n = i;
m->m = j;
return m;
}
extern "C"
void c_matrix_add(const c_matrix *m1, const c_matrix *m2, c_matrix *m) {
/* We only need 4 comparisons because we can assume
* transitivity of ints */
if(m1->m != m2->m || m1->n != m2->n || m1->m != m->m
|| m1->n != m2->n)
exit(EXIT_FAILURE);
double *d_a, *d_b, *d_c;
handle_error( cudaMalloc(&d_a, m1->m * m1->n * sizeof(double)) );
handle_error( cudaMalloc(&d_b, m1->m * m1->n * sizeof(double)) );
handle_error( cudaMalloc(&d_c, m1->m * m1->n * sizeof(double)) );
handle_error( cudaMemcpy(d_a, m1->data, m1->m * m1->n * sizeof(double), cudaMemcpyHostToDevice ) );
handle_error( cudaMemcpy(d_b, m2->data, m2->m * m2->n * sizeof(double), cudaMemcpyHostToDevice ) );
dim3 dimBlock(16, 16);
dim3 dimGrid((m1->m + dimBlock.x - 1) / dimBlock.x, (m1->n + dimBlock.y - 1) / dimBlock.y);
cu_matrix_add<<< dimGrid, dimBlock >>>(d_a, d_b, d_c, m1->m * m1->n);
handle_error( cudaMemcpy(m->data, d_c, m->m * m->n * sizeof(double), cudaMemcpyDeviceToHost ) );
// cudaFree( d_c );
cudaFree( d_b );
cudaFree( d_a );
}
内核本身:
__global__ void cu_matrix_add(const double *d_a, const double *d_b, double *d_c, int element_count) {
unsigned short tid = blockIdx.x * blockDim.x + threadIdx.x;
while( tid < element_count ) {
d_c[tid] = d_a[tid] + d_b[tid];
tid += blockDim.x * gridDim.x;
}
}
当我尝试将设备矩阵复制回主机矩阵时,它在handle_error( cudaMemcpy(m->data, d_c, m->m * m->n * sizeof(double), cudaMemcpyDeviceToHost ) );
上超时。