代码编译时没有错误,但调试时会出现访问冲突消息。任何人都可以指出下面的代码有什么问题吗?
我的代码实际上运行了1000次相同等式的1000次迭代 它是一个递归非线性方程。目的只是为了欣赏这种能力 并行运行多个(迭代)方程式。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <conio.h>
#include <cuda.h>
#include <cutil.h>
#include <time.h>
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define TOTAL_BLOCKS 4
#define VALUES_PER_THREAD 1000
#define THETA_VALUES_PER_THREAD 15
__global__ void my_compute(float *y_d, float *theta_d, float *u_d)
{
int offset = ((blockIdx.x * blockDim.x) + threadIdx.x) * VALUES_PER_THREAD;
int theta_offset = ((blockIdx.x * blockDim.x) + threadIdx.x) * THETA_VALUES_PER_THREAD;
for (int i = 7; i < 1000; i++) {
y_d[offset + i] = theta_d[theta_offset + 0] * y_d[offset + i - 1] +
theta_d[theta_offset + 1] * y_d[offset + i - 3] +
theta_d[theta_offset + 2] * u_d[offset + i - 5] * u_d[offset + i - 4] +
theta_d[theta_offset + 3] +
theta_d[theta_offset + 4] * u_d[offset + i - 6] +
theta_d[theta_offset + 5] * u_d[offset + i - 4] * y_d[offset + i - 6] +
theta_d[theta_offset + 6] * u_d[offset + i - 7] +
theta_d[theta_offset + 7] * u_d[offset + i - 7] * u_d[offset + i - 6] +
theta_d[theta_offset + 8] * y_d[offset + i - 4] +
theta_d[theta_offset + 9] * y_d[offset + i - 5] +
theta_d[theta_offset + 10] * u_d[offset + i - 4] * y_d[offset + i - 5] +
theta_d[theta_offset + 11] * u_d[offset + i - 4] * y_d[offset + i - 2] +
theta_d[theta_offset + 12] * u_d[offset + i - 7] * u_d[offset + i - 3] +
theta_d[theta_offset + 13] * u_d[offset + i - 5] +
theta_d[theta_offset + 14] * u_d[offset + i - 4];
}
}
int main(void)
{
float y[1000000];
FILE * fpoo;
FILE * u;
float theta[15000];
float u_data[1000000];
float *y_d;
float *theta_d;
float *u_d;
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//memory allocation
cudaMalloc((void **) &y_d, 1000000 * sizeof(float));
cudaMalloc((void **) &theta_d, 15000 * sizeof(float));
cudaMalloc((void **) &u_d, 1000000 * sizeof(float));
cudaEventRecord(start, 0);
// importing data for theta and input of model//
fpoo = fopen("c:\\Fly_theta.txt", "r");
u = fopen("c:\\Fly_u.txt", "r");
for (int i = 0; i < 1000; i++) {
for (int j = 0; j < 15; j++)
fscanf(fpoo, "%f\n", &theta[15 * i + j]);
}
for (int i = 0; i < 1000; i++) {
for (int j = 0; j < 1000; j++)
fscanf(u, "%f\n", &u_data[1000 * i + j]);
}
//initialising past input with the value of zero//
for (int i = 0; i < 1000; i++) {
for (int j = 0; j < 8; j++)
y[8 * i + j] = 0;
}
cudaMemcpy(y_d, y, 1000000 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(theta_d, theta, 15000 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(u_d, u_data, 1000000 * sizeof(float), cudaMemcpyHostToDevice);
//calling kernel function//
my_compute <<< 4, 256 >>> (y_d, theta_d, u_d);
cudaMemcpy(y, y_d, 1000000 * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 1000; i++) {
for (int j = 0; j < 1000; j++)
printf("%f", y[1000 * i + j]);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Time to generate: %3.1f ms \n", time);
cudaFree(y_d);
cudaFree(theta_d);
cudaFree(u_d);
fclose(u);
fclose(fpoo);
//fclose();
_getche();
return 0;
}
答案 0 :(得分:1)
您可能有堆栈溢出。
尝试在数组声明中添加static
说明符:
static float y[1000000];
static float theta[15000];
static float u_data[1000000];
答案 1 :(得分:1)
我尝试使用集成了cuda-memcheck的Toolkit 5.0中的cuda-gdb进行编码。
它向我展示的是在块(3,0,0)的线程(232,0,0)上,theta_offset是15000 - 并且从我对你的代码的预期看起来它应该永远不会超过14999( theta_d size是15000,这是可以使用的最大索引)
注意4个块* 256个线程* 15个元素/线程= 15360。
答案 2 :(得分:1)
theta_d的字节大小应为:
THREADS_PER_BLOCK * TOTAL_BLOCKS * THETA_VALUES_PER_THREAD * ELEMENT_SIZE =
256 * 4 * 15 * 4 = 61440
确保您选择的访问模式可以访问该数组的所有元素。在应用程序中, theta_d声明为:
cudaMalloc((void **) &theta_d, 15000 * sizeof(float));
实际上,theta_d的大小只有60000字节。因此,内核将在超过字节60000的所有访问中遇到错误。
一种解决方案是根据已定义的常量调整分配大小。如果用“THREADS_PER_BLOCK * TOTAL_BLOCKS * THETA_VALUES_PER_THREAD”替换15000,它将正确调整分配大小。尝试以下主要功能:
int main(void)
{
float y[1000000];
FILE * fpoo;
FILE * u;
float theta[THREADS_PER_BLOCK * TOTAL_BLOCKS * THETA_VALUES_PER_THREAD];
float u_data[1000000];
float *y_d;
float *theta_d;
float *u_d;
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//memory allocation
cudaMalloc((void **) &y_d, 1000000 * sizeof(float));
cudaMalloc((void **) &theta_d, THREADS_PER_BLOCK * TOTAL_BLOCKS * THETA_VALUES_PER_THREAD* sizeof(float));
cudaMalloc((void **) &u_d, 1000000 * sizeof(float));
cudaEventRecord(start, 0);
// importing data for theta and input of model//
fpoo = fopen("c:\\Fly_theta.txt", "r");
u = fopen("c:\\Fly_u.txt", "r");
for (int i = 0; i < 1000; i++) {
for (int j = 0; j < 15; j++)
fscanf(fpoo, "%f\n", &theta[15 * i + j]);
}
for (int i = 0; i < 1000; i++) {
for (int j = 0; j < 1000; j++)
fscanf(u, "%f\n", &u_data[1000 * i + j]);
}
//initialising past input with the value of zero//
for (int i = 0; i < 1000; i++) {
for (int j = 0; j < 8; j++)
y[8 * i + j] = 0;
}
cudaMemcpy(y_d, y, 1000000 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(theta_d, theta, THREADS_PER_BLOCK * TOTAL_BLOCKS * THETA_VALUES_PER_THREAD* sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(u_d, u_data, 1000000 * sizeof(float), cudaMemcpyHostToDevice);
//calling kernel function//
my_compute <<< 4, 256 >>> (y_d, theta_d, u_d);
cudaMemcpy(y, y_d, 1000000 * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 1000; i++) {
for (int j = 0; j < 1000; j++)
printf("%f", y[1000 * i + j]);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Time to generate: %3.1f ms \n", time);
cudaFree(y_d);
cudaFree(theta_d);
cudaFree(u_d);
fclose(u);
fclose(fpoo);
//fclose();
_getche();
return 0;
}
答案 3 :(得分:0)
守则确实是正确的。用于从文件读取数据的'fopen for loops'不是 按预期读取数据。所以我把它改成了下面的循环。我的代码现在 运行得很好。感谢所有贡献的人。
//导入theta的数据和模型的输入
fpoo= fopen("c:\\Fly_theta.txt","r");
u= fopen("c:\\Fly_u.txt","r");
for(int j=0;j<15;j++)
{
fscanf(fpoo,"%f\n",&theta[j]);
}
for(int i=1;i<1000;i++)
{
for(int j=0;j<15;j++)
{
theta[15*i+j]=theta[j];
}
}
for(int j=0;j<1000;j++)
{
fscanf(u,"%f\n",&u_data[j]);
}
for(int i=1;i<1000;i++)
{
for(int j=0;j<1000;j++)
{
u_data[1000*i+j]=u_data[j];
}
}