Question

我有一个程序（现在）计算GPU上随机点中两个函数的值，将这些值发送回主机，然后将它们可视化。这就是我得到的，一些不错的半随机点： random points 现在，如果我修改我的内核代码，并在最后添加本地数组初始化代码，

__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x; 

for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{   
    float* x=generateX(state);
    testPoints[i]=ZDT_f1(x);
    testPoints[i+1]=ZDT_f2(x);
}
//works fine with 'new'
//float* test_array=new float[2];
float test_array[2]={1.0f,2.0f};    
}

我每次都得到这样的东西： not-so-random

有谁知道这种行为的原因？在初始化test_array之前计算所有绘制的点，但它们受其影响。当我在＆＃39;之前初始化test_array时，它不会发生。循环。

主机/设备代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand_kernel.h"  
#include "device_functions.h"

#include <random>
#include <iostream>
#include <time.h>
#include <fstream>

using namespace std;

#define XSIZE 5
#define TEST_POINTS 100
#define NOF 2
#define BLOCK_COUNT 64
#define THR_COUNT 128
#define POINTS_PER_THREAD (NOF*TEST_POINTS+THR_COUNT*BLOCK_COUNT-1)/(THR_COUNT*BLOCK_COUNT)

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=false)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__device__ float g(float* x)
{
    float tmp=1;
    for(int i=1;i<XSIZE;i++)
        tmp*=x[i];

    return 1+9*(tmp/(XSIZE-1));
}

__device__ float ZDT_f1(float* x)
{
    return x[0];
}

__device__ float ZDT_f2(float* x)
{
    float gp=g(x);
    return gp*(1-sqrtf(x[0]/gp));
}


__device__ bool oneDominatesTwo(float* x1, float* x2)
{
    for(int i=0;i<XSIZE;i++)
        if(x1[i]>=x2[i])
            return false;

    return true;
}

__device__ float* generateX(curandState* globalState)
{

    int ind = threadIdx.x;
    float x[XSIZE];
    for(int i=0;i<XSIZE;i++)
        x[i]=curand_uniform(&globalState[ind]);

    return x;
}

__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    int id = blockDim.x*blockIdx.x+threadIdx.x;
    curand_init ( seed, id, 0, &state[id] );
}

__global__ void optymalize(curandState * state, float* testPoints)
{
    int ind=blockDim.x*blockIdx.x+threadIdx.x;
    int step=blockDim.x*gridDim.x; 

    for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
    {   
        float* x=generateX(state);
        testPoints[i]=ZDT_f1(x);
        testPoints[i+1]=ZDT_f2(x);
    }

    __syncthreads();
    //float* test_array=new float[2];
    //test_array[0]=1.0f;
    //test_array[1]=1.0f;

    float test_array[2]={1.0f,1.0f};    
}
void saveResultToFile(float* result)
{
    ofstream resultFile;
    resultFile.open ("result.txt");
    for(unsigned int i=0;i<NOF*TEST_POINTS;i+=NOF)
    {
        resultFile << result[i] << " "<<result[i+1]<<"\n";
    }
    resultFile.close();
}

int main()
{
    float* dev_fPoints;
    float* fPoints=new float[NOF*TEST_POINTS];
    gpuErrchk(cudaMalloc((void**)&dev_fPoints, NOF * TEST_POINTS * sizeof(float)));

    curandState* devStates;
    gpuErrchk(cudaMalloc(&devStates,THR_COUNT*sizeof(curandState)));

    cudaEvent_t start;
    gpuErrchk(cudaEventCreate(&start));
    cudaEvent_t stop;
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaThreadSetLimit(cudaLimitMallocHeapSize, 128*1024*1024));
    gpuErrchk(cudaEventRecord(start, NULL));
    setup_kernel<<<BLOCK_COUNT, THR_COUNT>>>(devStates,unsigned(time(NULL)));
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaGetLastError());

    optymalize<<<BLOCK_COUNT,THR_COUNT>>>(devStates, dev_fPoints);
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaGetLastError());

    gpuErrchk(cudaMemcpy(fPoints, dev_fPoints, NOF * TEST_POINTS * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaEventRecord(stop, NULL));
    gpuErrchk(cudaEventSynchronize(stop));
    float msecTotal = 0.0f;
    cudaEventElapsedTime(&msecTotal, start, stop);


    cout<<"Kernel execution time: "<<msecTotal<< "ms"<<endl;
    saveResultToFile(fPoints);
    system("start pythonw  plot_data.py result.txt");

    cudaFree(dev_fPoints);
    cudaFree(devStates);
    system("pause");
    return 0;
}

绘制脚本代码：

import matplotlib.pyplot as plt;
import sys;

if len(sys.argv)<2:
    print("Usage: python PlotScript <filename>");
    sys.exit(0);

path=sys.argv[1];
x=[]
y=[]
with open(path,"r") as f:
    for line in f:
        vals=line.strip().split(" ");
        x.append(vals[0]);
        y.append(vals[1]);

plt.plot(x,y,'ro')
plt.show();

Answer 1

基本问题在于您最初未在问题中显示的代码，特别是：

__device__ float* generateX(curandState* globalState)
{

    int ind = threadIdx.x;
    float x[XSIZE];
    for(int i=0;i<XSIZE;i++)
        x[i]=curand_uniform(&globalState[ind]);

    return x;
}

从函数返回对本地范围变量的地址或引用会导致未定义的行为。仅在x范围内使用generateX作为参考或值才有效。毫无疑问，在内核中添加或移动其他局部范围变量会改变内核行为。

修复此函数，使其填充通过引用传递的数组，而不是返回本地范围数组的地址。并注意编译器警告 - 这将有一个应该立即引起警告的错误。

CUDA本地数组初始化修改程序输出

1 个答案: