#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// Size in bytes of one int of the table / of the input sequences.
#define SZ_INT (sizeof(int))
// Number of ints stored per table cell.
#define CELL_SZ 1
// A cell packs a length (upper bits) together with a one-bit direction flag (bit 0).
// Every macro argument is parenthesized so that expressions such as `c ? 1 : 0`
// or `a | b` can be passed safely.
#define CELL_VALUE(a,x) (((a) << 1) | (x))
// FROM: 1 if L[i][j] took value from L[i - 1][j], 0 if L[i][j] took value from L[i][j - 1]
#define FROM(a) ((a) & 1)
// Length part of a packed cell.
#define LENGTH(a) ((a) >> 1)
// Flat offset of cell (i, j) in a table whose consecutive rows are (m + 1) ints apart.
#define INDEX(i,j,m) ((i) * ((m) + 1) + (j))
// Reports a failed CUDA call on stdout (message, file and line) and lets execution continue.
// The argument is evaluated exactly once, so a failing call is never re-issued while
// the message is being formatted, and the do/while wrapper makes the macro behave like
// a single statement inside unbraced if/else bodies.
#define CUDA_CHECK_ERROR(err) \
do { \
    cudaError_t cuda_check_status_ = (err); \
    if (cuda_check_status_ != cudaSuccess) { \
        printf("Cuda error: %s\n", cudaGetErrorString(cuda_check_status_)); \
        printf("Error in file: %s, line: %i\n", __FILE__, __LINE__); \
    } \
} while (0)
/*
 * Computes every cell (i, j) with i + j == diag of the LCS table.
 *
 * L    - pointer to cell (0, 0) of the table; cells are addressed with INDEX(i, j, m).
 *        The cells at INDEX(i - 1, j - 1, m), INDEX(i - 1, j, m) and INDEX(i, j - 1, m)
 *        are read even when i or j is 0, so the caller must make those offsets
 *        (which lie before or between the real cells) readable and hold a length of 0.
 * A, n - first sequence and its length (row index i walks A).
 * B, m - second sequence and its length (column index j walks B).
 * diag - index of the anti-diagonal to fill.
 *
 * A cell only reads cells whose i + j is diag - 1 or diag - 2, so the caller has to
 * launch the kernel once per diagonal, in increasing order of diag.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its global index and
 * advances by the total number of launched threads, so all m columns are covered
 * for any grid/block size.
 */
__global__ void Find_L_entry (int *L, int *A, int n, int *B, int m, int diag) {
    const int stride = gridDim.x * blockDim.x;
    for (int j = threadIdx.x + blockIdx.x * blockDim.x; j < m; j += stride) {
        const int i = diag - j;
        if (i < 0)
            break;      // j only grows from here, so i stays negative
        if (i >= n)
            continue;   // this column has no cell on the current diagonal
        if (A[i] == B[j]) {
            // Match: extend the sequence ending at the upper-left neighbour.
            L[INDEX(i, j, m)] = CELL_VALUE(LENGTH(L[INDEX(i - 1, j - 1, m)]) + 1, 0);
        } else {
            // No match: inherit the longer of the upper / left neighbours and
            // record in the flag bit which one was taken (1 = upper, 0 = left).
            const int up = LENGTH(L[INDEX(i - 1, j, m)]);
            const int left = LENGTH(L[INDEX(i, j - 1, m)]);
            L[INDEX(i, j, m)] = (up > left) ? CELL_VALUE(up, 1) : CELL_VALUE(left, 0);
        }
    }
}
/*
 * Prints the length of the longest common subsequence and the subsequence itself.
 *
 * L    - pointer to cell (0, 0) of the filled table, addressed with INDEX(i, j, m).
 *        L[INDEX(n - 1, m - 1, m)] is read unconditionally, so for n == 0 or m == 0
 *        the offset before L that this resolves to must be readable.
 * A, n - first sequence and its length.
 * B, m - second sequence and its length.
 *
 * The walk starts at cell (n - 1, m - 1): equal elements are emitted and both indices
 * step back; otherwise the cell's FROM flag selects whether i or j steps back.
 */
__host__ void output_sequence(int *L, int *A, int n, int *B, int m) {
    const int len = LENGTH(L[INDEX(n - 1, m - 1, m)]);
    int *lcs = NULL;
    if (len > 0) {
        lcs = (int*) malloc(len * SZ_INT);
        if (lcs == NULL) {
            fprintf(stderr, "Out of memory while reconstructing the sequence\n");
            return;
        }
    }
    int top = 0;
    int i = n - 1, j = m - 1;
    // `top < len` stops the walk once every element has been collected and keeps
    // writes inside the buffer even if the table contents are inconsistent.
    while (i >= 0 && j >= 0 && top < len) {
        if (A[i] == B[j]) {
            lcs[top++] = A[i];
            i--; j--;
        } else if (FROM(L[INDEX(i, j, m)]) == 1) {
            i--;
        } else {
            j--;
        }
    }
    printf("Length: %d\nSequence: ", len);
    if (top == 0)
        printf("\n");   // terminate the line for an empty subsequence as well
    // Elements were collected back to front, so print them in reverse.
    for (int k = top - 1; k >= 0; k--) {
        printf("%d%c", lcs[k], k ? ' ' : '\n');
    }
    free(lcs);
}
/*
 * Prompts for and reads one integer sequence from standard input.
 *
 * A   - receives a malloc'ed array with the elements (NULL when the sequence is
 *       empty); the caller releases it with free().
 * n   - receives the number of elements.
 * num - ordinal of the sequence, used only in the prompts and error messages.
 *
 * Terminates the program with EXIT_FAILURE when the count is missing or negative,
 * when an element cannot be parsed, or when the allocation fails, instead of
 * continuing with uninitialized data.
 */
__host__ void read_sequence(int *&A, int &n, int num) {
    A = NULL;
    n = 0;
    printf("Enter number of elements in sequence %d\n", num);
    int count = 0;
    if (scanf("%d", &count) != 1 || count < 0) {
        fprintf(stderr, "Invalid number of elements for sequence %d\n", num);
        exit(EXIT_FAILURE);
    }
    int *items = NULL;
    if (count > 0) {
        items = (int*) malloc(count * sizeof(int));
        if (items == NULL) {
            fprintf(stderr, "Out of memory while reading sequence %d\n", num);
            exit(EXIT_FAILURE);
        }
    }
    printf("Enter %d elements of sequence %d\n", count, num);
    for (int k = 0; k < count; k++) {
        if (scanf("%d", items + k) != 1) {
            fprintf(stderr, "Invalid element %d of sequence %d\n", k + 1, num);
            free(items);
            exit(EXIT_FAILURE);
        }
    }
    A = items;
    n = count;
}
// Runs one CUDA call unless an earlier one already failed, reports a failure through
// CUDA_CHECK_ERROR and records it in the local flag `gpu_ok`.
#define LCS_TRY(call) \
do { \
    if (gpu_ok) { \
        cudaError_t lcs_try_status_ = (call); \
        CUDA_CHECK_ERROR(lcs_try_status_); \
        gpu_ok = (lcs_try_status_ == cudaSuccess); \
    } \
} while (0)

/*
 * Usage: lcs <number_of_blocks> <threads_in_block>
 *
 * Reads two integer sequences from standard input, fills the LCS table on the GPU
 * one anti-diagonal per kernel launch, and prints the length of the longest common
 * subsequence followed by the subsequence.
 *
 * Table layout: (n + 1) * (m + 1) cells with rows (m + 1) ints apart. Cell (0, 0)
 * lives at offset (m + 2), which leaves one zero row in front of the data and one
 * zero cell in front of every row; those are the cells addressed with index -1.
 *
 * Returns 0 on success and 1 on bad arguments, allocation failure or CUDA failure.
 */
int main ( int argc, char **argv ) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <number_of_blocks> <threads_in_block>\n", argv[0]);
        return 1;
    }
    int number_of_blocks = atoi(argv[1]), threads_in_block = atoi(argv[2]);
    if (number_of_blocks <= 0 || threads_in_block <= 0) {
        fprintf(stderr, "number_of_blocks and threads_in_block must be positive integers\n");
        return 1;
    }

    int n, m;
    int *A, *B;
    read_sequence(A, n, 1);
    read_sequence(B, m, 2);

    // size_t arithmetic avoids int overflow for large inputs. calloc zeroes the whole
    // table, padding included; the extra int keeps &big_L[m + 2] inside the
    // allocation when one of the sequences is empty.
    const size_t cells = (size_t)(n + 1) * (size_t)(m + 1) * CELL_SZ;
    int *big_L = (int*) calloc(cells + 1, SZ_INT);
    if (big_L == NULL) {
        fprintf(stderr, "Out of memory while allocating the table\n");
        free(A); free(B);
        return 1;
    }
    int *L = &big_L[(m + 2) * CELL_SZ];

    bool gpu_ok = true;
    if (n > 0 && m > 0) {
        int *d_A = NULL, *d_B = NULL, *dev_L = NULL;
        LCS_TRY(cudaMalloc((void**)&d_A, n * SZ_INT));
        LCS_TRY(cudaMalloc((void**)&d_B, m * SZ_INT));
        LCS_TRY(cudaMalloc((void**)&dev_L, cells * SZ_INT));
        LCS_TRY(cudaMemcpy(d_A, A, n * SZ_INT, cudaMemcpyHostToDevice));
        LCS_TRY(cudaMemcpy(d_B, B, m * SZ_INT, cudaMemcpyHostToDevice));
        // Upload the table from its base so the padding cells in front of cell (0, 0)
        // are zero on the device too; the kernel reads them for row 0 and column 0.
        LCS_TRY(cudaMemcpy(dev_L, big_L, cells * SZ_INT, cudaMemcpyHostToDevice));

        // Guarantee at least one thread per column so that no cell of a diagonal
        // can be left out, whatever block count was requested.
        const int min_blocks = (m + threads_in_block - 1) / threads_in_block;
        if (number_of_blocks < min_blocks)
            number_of_blocks = min_blocks;

        int *d_L = gpu_ok ? &dev_L[(m + 2) * CELL_SZ] : NULL;
        const int diag_count = n + m - 1;
        // Launches on the same stream execute in order, so each diagonal sees the
        // finished previous ones without a host-side synchronization in between.
        for (int diag = 0; gpu_ok && diag < diag_count; diag++) {
            Find_L_entry<<<number_of_blocks, threads_in_block>>>(d_L, d_A, n, d_B, m, diag);
            LCS_TRY(cudaGetLastError());
        }
        LCS_TRY(cudaDeviceSynchronize());
        LCS_TRY(cudaMemcpy(big_L, dev_L, cells * SZ_INT, cudaMemcpyDeviceToHost));

        // Free the base pointer returned by cudaMalloc, not the offset view d_L.
        cudaFree(dev_L);
        cudaFree(d_A);
        cudaFree(d_B);
    }

    if (gpu_ok)
        output_sequence(L, A, n, B, m);

    free(A); free(B); free(big_L);
    return gpu_ok ? 0 : 1;
}
#undef LCS_TRY
我正在通过cmd进行编译。
nvcc -g -G -arch=sm_21 -o lcs.exe lcs.cu
当我运行它时,收到如下运行时错误:"Cuda error: invalid device function"(无效的设备函数),报错位置为第94行。
答案 0 :(得分:2)
您之所以收到这个运行时错误,是因为运行时API无法找到或生成可以在您的GPU上运行的代码。
根本原因是您编译代码时指定的目标体系结构(计算能力2.1)与您的GPU不兼容。您已经说明您使用的是GT310M,从NVIDIA官方的CUDA GPU列表可以查到,它是一个计算能力1.2的设备。CUDA工具链支持向后兼容(即为较旧体系结构编译的代码可以在较新的设备上运行),但反过来不行:为较新体系结构编译的代码无法在较旧的设备上运行。
你应该像这样构建你的代码:
nvcc -g -G -arch=sm_12 -o lcs.exe lcs.cu