我遇到了CUDA中的代码问题。代码编译正确但在程序完全运行时会产生意外的输出。
在这种情况下,步骤应该增加,直到达到stepcount。但是,只输出一步。究竟我做错了什么?
另外,如何引用特定的xcord,ycord或zcord。例如。在CPU代码中使用数组我可以通过xcord [1]来引用一个特定的元素。在CUDA的情况下,我使用xcord [threadidx.x]吗?
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
/* Kernal code */
__global__
void run(float *lvelox, float *lveloy, float *lveloz, float *xcord, float *ycord, float *zcord)
{
lveloy[threadIdx.x] = lveloy[threadIdx.x] - 9.81;
xcord[threadIdx.x] = xcord[threadIdx.x] + lvelox[threadIdx.x];
ycord[threadIdx.x] = ycord[threadIdx.x] + lveloy[threadIdx.x];
zcord[threadIdx.x] = zcord[threadIdx.x] + lveloz[threadIdx.x];
}
/* Host code */
int main(void) {
FILE *ofp;
char *mode = "r";
char outputFilename[] = "Output.txt";
float dlvelox;
float dlveloy;
float dlveloz;
float lvelox[1000] = {};
float lveloy[1000] = {};
float lveloz[1000] = {};
float xcord[1000] = {};
float ycord[1000] = {};
float zcord[1000] = {};
int fp;
int id;
int stepcount;
int step = 0;
{
ofp = fopen(outputFilename, "w");
if (ofp == NULL) {
fprintf(stderr, "Can't open output file %s!\n", outputFilename);
exit(1);
}
/* Obtaining velocity */
{
printf("Enter the initial horizontal velocity of the balls:\n");
scanf("%f", &dlvelox);
fprintf(ofp, "Initial horizontal velocity: %f\n", dlvelox);
printf("Enter the initial vertical velocity of the balls:\n");
scanf("%f", &dlveloy);
fprintf(ofp, "Initial vertical velocity: %f\n", dlveloy);
printf("Enter the initial Z velocity of the balls:\n");
scanf("%f", &dlveloz);
fprintf(ofp, "Initial Z velocity: %f\n", dlveloz);
}
for (int i = 0; i < 1000; i++)
lvelox[i] = dlvelox;
for (int i = 0; i < 1000; i++)
lveloy[i] = dlveloy;
for (int i = 0; i < 1000; i++)
lveloz[i] = dlveloz;
/* Obtain number of steps */
{
printf("Enter the number of steps wanted:\n");
scanf("%d", &stepcount);
fprintf(ofp, "Number of steps: %d\n", stepcount);
}
/* Initial console display */
{
fprintf(ofp, "\n");
fprintf(ofp, "X-cord, Y-cord, Z-cord, Horizontal Velo, Vertical Velo, Z Velo, Ball ID, Step\n");
fprintf(ofp, "\n");
}
/* GPU setup */
float *lveloxd;
float *lveloyd;
float *lvelozd;
float *xcordd;
float *ycordd;
float *zcordd;
int *stepd;
const int fsize = 1000*sizeof(float);
const int isize = 1000*sizeof(int);
/* Loop method */
while ( step < stepcount )
{
/* Memory allocation and copying to GPU */
cudaMalloc( (void**)&lveloxd, fsize );
cudaMalloc( (void**)&lveloyd, fsize );
cudaMalloc( (void**)&lvelozd, fsize );
cudaMalloc( (void**)&xcordd, fsize );
cudaMalloc( (void**)&ycordd, fsize );
cudaMalloc( (void**)&zcordd, fsize );
cudaMemcpy( lveloxd, lvelox, fsize, cudaMemcpyHostToDevice );
cudaMemcpy( lveloyd, lveloy, fsize, cudaMemcpyHostToDevice );
cudaMemcpy( lvelozd, lveloz, fsize, cudaMemcpyHostToDevice );
cudaMemcpy( xcordd, xcord, fsize, cudaMemcpyHostToDevice );
cudaMemcpy( ycordd, ycord, fsize, cudaMemcpyHostToDevice );
cudaMemcpy( zcordd, zcord, fsize, cudaMemcpyHostToDevice );
/* Perform ACTUAL LOOP */
dim3 dimBlock( 1000 );
dim3 dimGrid ( 1 );
run<<<dimGrid, dimBlock>>>(lveloxd, lveloyd, lvelozd, xcordd, ycordd, zcordd);
/* Copy back the data */
cudaMemcpy( lvelox, lveloxd, fsize, cudaMemcpyDeviceToHost );
cudaMemcpy( lveloy, lveloyd, fsize, cudaMemcpyDeviceToHost );
cudaMemcpy( lveloz, lvelozd, fsize, cudaMemcpyDeviceToHost );
cudaMemcpy( xcord, xcordd, fsize, cudaMemcpyDeviceToHost );
cudaMemcpy( ycord, ycordd, fsize, cudaMemcpyDeviceToHost );
cudaMemcpy( zcord, zcordd, fsize, cudaMemcpyDeviceToHost );
cudaFree( lveloxd );
cudaFree( lveloyd );
cudaFree( lvelozd );
cudaFree( xcordd );
cudaFree( ycordd );
cudaFree( zcordd );
fprintf(ofp, "%f, %f, %f, %f, %f, %f, %d\n", xcord, ycord, zcord, lvelox, lveloy, lveloz, step);
step = step + 1;
if ( step == stepcount )
{
return 0;
}
}
fclose(ofp);
}
}
答案 0 :(得分:0)
如果是由于内核中的SEGFAULT
,则可能的答案如下:
您正在分配1000个实体的数组,并启动一个包含1000个线程的块。由于cuda的粒度为32个线程(warpsize)而你需要1000个线程,因此将启动1024个线程作为ceil(1000/32)= 32个warps = 1024个线程。
现在因为你只有1000个元素的数组,你没有请求但产生的24个线程将访问外部分配的内存,导致SEGFAULT
,即lveloy[threadIdx.x]
其中threadIdx.x = 1000-1023
。
为了防止这种情况,请将其包括在内:
if(threadIdx.x < 1000) {
lveloy[threadIdx.x] = lveloy[threadIdx.x] - 9.81;
xcord[threadIdx.x] = xcord[threadIdx.x] + lvelox[threadIdx.x];
ycord[threadIdx.x] = ycord[threadIdx.x] + lveloy[threadIdx.x];
zcord[threadIdx.x] = zcord[threadIdx.x] + lveloz[threadIdx.x];
}
另外,如何引用特定的xcord,ycord或zcord。 例如。在CPU代码中使用数组我可以引用一个特定的元素 xcord [1]。在CUDA的情况下,我使用xcord [threadidx.x]吗?
您可以像在CPU上一样访问它,通过文字xcord [1]或依赖于线程xcord [threadidx.x]或两者都作为xcord [threadidx.x + 1](不推荐由于非合并的记忆)