在命令行上使用cuda-memcheck时出错

时间:2017-11-29 04:47:39

标签: c++ parallel-processing cuda

好吧所以当我不使用cuda-memcheck时,这个程序基本上计算出我对nPhotons的小值(~1000)所期望的,但是如果nPhotons变得太大(~100000),它很可能无法完成。我认为这可能是一个内存错误,所以我从命令行运行cuda-memcheck,它给了我很多错误。在内核调用之后,这些都发生在cudaMemcpy和cudaFree的每个实例上。

Program hit cudaErrorUnknown (error 30) due to "unknown error" on CUDA API call to cudaFree.
Program hit cudaErrorUnknown (error 30) due to "unknown error" on CUDA API call to cudaMemcpy.
Program hit CUDA_ERROR_UNKNOWN (error 999) due to "unknown error" on CUDA API call to cuModuleUnload.

所以我无法弄清楚这有什么问题或为什么程序在更高的nPhotons失败。我猜这是一个简单的错误,但我不知所措。完整的代码如下。

#include <iostream>
#include <fstream>
#include <cstdlib>
#include <ctime>
#include <math.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define PI 3.14159265
#define MAX 4294967296
using namespace std;

//Generate random numbers
__device__ float rando() {
    curandState_t state;
    curand_init(clock64(), threadIdx.x, 0, &state);
    return (float)curand(&state) / MAX;
}

//Initialization of photons
__device__ void new_photon(float *photon){

    float cosTh = rando();
    photon[0] = cosTh;
    float sinTh = sqrt(1.0f - cosTh * cosTh);
    photon[1] = sinTh;
    float phi = 2 * PI * (rando());
    photon[2] = phi;
    float cosPh = cos(phi);
    photon[3] = cosPh;
float sinPh = sin(phi);
photon[4] = sinPh;

//Initializing Position
float xTotal = 0;
photon[5] = xTotal;
float yTotal = 0;
photon[6] = yTotal;
float zTotal = 0;
photon[7] = zTotal;
}

//Isotropic Scattering of photon off of slab
__device__ void iso_scatt(float *photon){
float cosTh = (rando() * 2 - 1);
photon[0] = cosTh;
float sinTh = sqrt(1 - cosTh * cosTh);
photon[1] = sinTh;
float phi = 2 * PI * (rando());
photon[2] = phi;
float cosPh = cos(phi);
photon[3] = cosPh;
float sinPh = sin(phi);
photon[4] = sinPh;
}


//Moments
__device__ void moments(float x1, float y1, float z1, float x2, float y2,float z2, float cosTh, int nLevel,
float *jPlus, float *hPlus, float *kPlus, float *jMinus, float *kMinus, float *hMinus){
int l1, l2;
if (z1 > 0 && z2 > 0 && floor(z1 * nLevel) == floor(z2 * nLevel)){
    return;
}

if (cosTh > 0){
    if (z1 <= 0){
        l1 = 1;
    }
    else{
        l1 = z1 * nLevel + 2;
    }
    if (z2 >= 1){
        l2 = nLevel + 1;
    }
    else{
        l2 = z2 * nLevel + 1;
    }

    for (int n = l1 - 1; n < l2; n++){
        atomicAdd(&jPlus[n], 1.0f / cosTh);
        atomicAdd(&hPlus[n], 1.0f);
        atomicAdd(&kPlus[n], cosTh);
    }
}
else if (cosTh < 0){
    l1 = (z1 * nLevel) + 1;
    if (z2 < 0) {
        l2 = 1;
    }
    else {
        l2 = (z2 * nLevel) + 2;
    }
    for (int n = l2 - 1; n < l1; n++) {
        //__syncthreads();
        atomicAdd(&jMinus[n], 1.0f / (abs(cosTh)));
        atomicAdd(&hMinus[n], -1.0f);
        atomicAdd(&kMinus[n], abs(cosTh));

    }
}

}

//Generates photons and allows them to propagate from the origin
__global__ void work(float *hPlus, float *kPlus, float *jMinus, float *kMinus, float *hMinus,
float *jPlus, int muBins, float *energy, int nLevel, float *erri,
float tauMax, int albedo, int seed, int *test){

//initialization of photon
float photon[8] = { 0 };

//atomic adder test
atomicAdd(&test[0], 1);

//Generate a new photon when it's z2 position is less than 0
newP:
new_photon(photon);
int aFlag = 0;

while ((photon[7] >= 0) && (photon[7] <= 1)){
    float x1 = photon[5];
    float y1 = photon[6];
    float z1 = photon[7];
    float tau = -log(rando());
    float s = tau / tauMax;
    photon[5] = photon[5] + s*photon[1] * photon[3];
    photon[6] = photon[6] + s*photon[1] * photon[4];
    photon[7] = photon[7] + s*photon[0];
    float x2 = photon[5];
    float y2 = photon[6];
    float z2 = photon[7];
    moments(x1, y1, z1, x2, y2, z2, photon[0], nLevel,
        jPlus, hPlus, kPlus, jMinus, kMinus, hMinus);
    if ((photon[7] < 0) || (photon[7] > 1))
    {
        continue;
    }
    if (rando()< albedo)
    {
        iso_scatt(photon);
    }
    else{
    aFlag = 1;
    continue;
    }
}
if (photon[7] < 0){
    goto newP;
}

if (aFlag == 0){
    int l = int(muBins*photon[0]);
    atomicAdd(&erri[l], 1.0f);
    atomicAdd(&energy[l], 1.0f);
}
}

//Output 
void output(float hPlus[], float kPlus[], float jMinus[], float kMinus[],   float hMinus[], float jPlus[],
int nPhotons, int muBins, float intensity[], float energy[], float nLevel, float sigmai[], float theta[], float erri[])
{
//setting values for arrays
for (int n = 0; n < muBins; n++){
    intensity[n] = energy[n] / (2 * nPhotons*cos(theta[n] * PI / 180))*muBins;
    sigmai[n] = sqrt(erri[n]) / nPhotons;
    energy[n] = energy[n] / nPhotons;
}

for (int n = 0; n < nLevel; n++){
    jPlus[n] = jPlus[n] / nPhotons;
    jMinus[n] = jMinus[n] / nPhotons;
    hPlus[n] = hPlus[n] / nPhotons;
    hMinus[n] = hMinus[n] / nPhotons;
    kPlus[n] = kPlus[n] / nPhotons;
    kMinus[n] = kMinus[n] / nPhotons;
}
// write output to file: "intensity.dat"
ofstream file;
file.open("intensity.dat");
for (int i = muBins - 1; i > -1; i--) {
    file << theta[i] << "\t" << energy[i] << "\t" << sigmai[i] << "\t" << intensity[i] << "\n";
}
file.close();

// write output to file: "moments.dat"
ofstream file2;
file2.open("moments.dat");
for (int i = 0; i <= nLevel; i++) {
    file2 << jPlus[i] << "\t" << jMinus[i] << "\t" << hPlus[i] << "\t" << hMinus[i] << "\t" << kPlus[i] << "\t" << kMinus[i] << "\n";
}
file2.close();
}

int main()
{
//Follow particles through simulation
//Call all functions
//Read in parameter file

//creation of host variables
int numPhotons;
cout << "Give an nPhotons value: " << endl;
cin >> numPhotons;
const int nPhotons = numPhotons;
const int muBins = 10;
const int nmu = 20;
int nLevel = 10;
const int nLev = 21;
float tauMax = 10;
int albedo = 1;
float dTheta = float(1) / float(muBins);
float theta[muBins] = {0};
float halfW = 0.5 * dTheta;
float intensity[nmu] = { 0 };
float sigmai[nmu] = { 0 };
float *energy;
float *erri;
float *jPlus;
float *jMinus;
float *hPlus;
float *hMinus;
float *kPlus;
float *kMinus;
int *test;

//initialization of dtheta
for( int i = 0; i < muBins; i++){
    theta[i] = acos(float(i)*dTheta + halfW)* 180/PI;
}

//initialization of host memory
hPlus = (float*)malloc(nLev * sizeof(float));
jPlus = (float*)malloc(nLev * sizeof(float));
kPlus = (float*)malloc(nLev * sizeof(float));
hMinus = (float*)malloc(nLev * sizeof(float));
jMinus = (float*)malloc(nLev * sizeof(float));
kMinus = (float*)malloc(nLev * sizeof(float));
energy = (float*)malloc(nmu * sizeof(int));
erri = (float*)malloc(nmu * sizeof(int));
test = (int*)malloc(sizeof(int));

//Creation of device variables
float *d_hPlus;
float *d_kPlus;
float *d_jMinus;
float *d_kMinus;
float *d_hMinus;
float *d_jPlus;
float *d_energy;
float *d_erri;
int *d_test;

//Initialization of device memory
cudaMalloc((void**)&d_hPlus, nLev * sizeof(float));
cudaMalloc((void**)&d_jPlus, nLev * sizeof(float));
cudaMalloc((void**)&d_kPlus, nLev * sizeof(float));
cudaMalloc((void**)&d_hMinus, nLev * sizeof(float));
cudaMalloc((void**)&d_jMinus, nLev * sizeof(float));
cudaMalloc((void**)&d_kMinus, nLev * sizeof(float));
cudaMalloc((void**)&d_energy, nmu * sizeof(float));
cudaMalloc((void**)&d_erri, nmu * sizeof(float));
cudaMalloc((void**)&d_test, sizeof(int));

//Setting device memory to 0
cudaMemset(d_hPlus, 0, nLev * sizeof(float));
cudaMemset(d_jPlus, 0, nLev * sizeof(float));
cudaMemset(d_kPlus, 0, nLev * sizeof(float));
cudaMemset(d_hMinus, 0, nLev * sizeof(float));
cudaMemset(d_jMinus, 0, nLev * sizeof(float));
cudaMemset(d_kMinus, 0, nLev * sizeof(float));
cudaMemset(d_energy, 0, nmu * sizeof(float));
cudaMemset(d_erri, 0, nmu * sizeof(float));
cudaMemset(d_test, 0, sizeof(int));

int seed = static_cast<unsigned int>(time(0));

work << <nPhotons / 512, 512 >> >(d_hPlus, d_kPlus, d_jMinus, d_kMinus, d_hMinus, d_jPlus,
    muBins, d_energy, nLevel, d_erri, tauMax, albedo, seed, d_test);

cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("Error: %s\n", cudaGetErrorString(err));

cudaMemcpy(hPlus, d_hPlus, nLev * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(jPlus, d_jPlus, nLev * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(kPlus, d_kPlus, nLev * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(hMinus, d_hMinus, nLev * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(jMinus, d_jMinus, nLev * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(kMinus, d_kMinus, nLev * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(energy, d_energy, nmu * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(erri, d_erri, nmu * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(test, d_test, sizeof(int), cudaMemcpyDeviceToHost);

for (int i = 0; i < 10; i++) {
    cout << erri[i] << endl;
}
printf("Test is %d", test[0]);

output(hPlus, kPlus, jMinus, kMinus, hMinus, jPlus, nPhotons, muBins, intensity, energy, nLevel, sigmai, theta, erri);
cudaFree(d_hPlus);
cudaFree(d_jPlus);
cudaFree(d_kPlus);
cudaFree(d_hMinus);
cudaFree(d_jMinus);
cudaFree(d_kMinus);
cudaFree(d_energy);
cudaFree(d_erri);
cudaFree(d_test);

return 0;
}

0 个答案:

没有答案