我有一个程序(现在)计算GPU上随机点中两个函数的值,将这些值发送回主机,然后将它们可视化。这就是我得到的,一些不错的半随机点: 现在,如果我修改我的内核代码,并在最后添加本地数组初始化代码,
__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x;
for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{
float* x=generateX(state);
testPoints[i]=ZDT_f1(x);
testPoints[i+1]=ZDT_f2(x);
}
//works fine with 'new'
//float* test_array=new float[2];
float test_array[2]={1.0f,2.0f};
}
我每次都得到这样的东西:
有谁知道这种行为的原因?在初始化test_array之前计算所有绘制的点,但它们受其影响。当我在&#39;之前初始化test_array时,它不会发生。循环。
主机/设备代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand_kernel.h"
#include "device_functions.h"
#include <random>
#include <iostream>
#include <time.h>
#include <fstream>
using namespace std;
#define XSIZE 5
#define TEST_POINTS 100
#define NOF 2
#define BLOCK_COUNT 64
#define THR_COUNT 128
#define POINTS_PER_THREAD (NOF*TEST_POINTS+THR_COUNT*BLOCK_COUNT-1)/(THR_COUNT*BLOCK_COUNT)
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=false)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__device__ float g(float* x)
{
float tmp=1;
for(int i=1;i<XSIZE;i++)
tmp*=x[i];
return 1+9*(tmp/(XSIZE-1));
}
__device__ float ZDT_f1(float* x)
{
return x[0];
}
__device__ float ZDT_f2(float* x)
{
float gp=g(x);
return gp*(1-sqrtf(x[0]/gp));
}
__device__ bool oneDominatesTwo(float* x1, float* x2)
{
for(int i=0;i<XSIZE;i++)
if(x1[i]>=x2[i])
return false;
return true;
}
__device__ float* generateX(curandState* globalState)
{
int ind = threadIdx.x;
float x[XSIZE];
for(int i=0;i<XSIZE;i++)
x[i]=curand_uniform(&globalState[ind]);
return x;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = blockDim.x*blockIdx.x+threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x;
for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{
float* x=generateX(state);
testPoints[i]=ZDT_f1(x);
testPoints[i+1]=ZDT_f2(x);
}
__syncthreads();
//float* test_array=new float[2];
//test_array[0]=1.0f;
//test_array[1]=1.0f;
float test_array[2]={1.0f,1.0f};
}
void saveResultToFile(float* result)
{
ofstream resultFile;
resultFile.open ("result.txt");
for(unsigned int i=0;i<NOF*TEST_POINTS;i+=NOF)
{
resultFile << result[i] << " "<<result[i+1]<<"\n";
}
resultFile.close();
}
int main()
{
float* dev_fPoints;
float* fPoints=new float[NOF*TEST_POINTS];
gpuErrchk(cudaMalloc((void**)&dev_fPoints, NOF * TEST_POINTS * sizeof(float)));
curandState* devStates;
gpuErrchk(cudaMalloc(&devStates,THR_COUNT*sizeof(curandState)));
cudaEvent_t start;
gpuErrchk(cudaEventCreate(&start));
cudaEvent_t stop;
gpuErrchk(cudaEventCreate(&stop));
gpuErrchk(cudaThreadSetLimit(cudaLimitMallocHeapSize, 128*1024*1024));
gpuErrchk(cudaEventRecord(start, NULL));
setup_kernel<<<BLOCK_COUNT, THR_COUNT>>>(devStates,unsigned(time(NULL)));
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaGetLastError());
optymalize<<<BLOCK_COUNT,THR_COUNT>>>(devStates, dev_fPoints);
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaMemcpy(fPoints, dev_fPoints, NOF * TEST_POINTS * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaEventRecord(stop, NULL));
gpuErrchk(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop);
cout<<"Kernel execution time: "<<msecTotal<< "ms"<<endl;
saveResultToFile(fPoints);
system("start pythonw plot_data.py result.txt");
cudaFree(dev_fPoints);
cudaFree(devStates);
system("pause");
return 0;
}
绘制脚本代码:
import matplotlib.pyplot as plt;
import sys;
if len(sys.argv)<2:
print("Usage: python PlotScript <filename>");
sys.exit(0);
path=sys.argv[1];
x=[]
y=[]
with open(path,"r") as f:
for line in f:
vals=line.strip().split(" ");
x.append(vals[0]);
y.append(vals[1]);
plt.plot(x,y,'ro')
plt.show();
答案 0 :(得分:2)
基本问题在于您最初未在问题中显示的代码,特别是:
__device__ float* generateX(curandState* globalState)
{
int ind = threadIdx.x;
float x[XSIZE];
for(int i=0;i<XSIZE;i++)
x[i]=curand_uniform(&globalState[ind]);
return x;
}
从函数返回对本地范围变量的地址或引用会导致未定义的行为。仅在x
范围内使用generateX
作为参考或值才有效。毫无疑问,在内核中添加或移动其他局部范围变量会改变内核行为。
修复此函数,使其填充通过引用传递的数组,而不是返回本地范围数组的地址。并注意编译器警告 - 这将有一个应该立即引起警告的错误。