我正在尝试使用CURAND函数以随机方式初始化神经网络的权重(存储为浮点数)。
我首先用一些值初始化神经网络,然后尝试把nn结构(nn代表神经网络)中存储权重值的两个矩阵(nn.wih和nn.who)复制到设备内存中。
然后我调用一个函数,该函数应该随机化矩阵的值(assignRandomWeight),它会启动两个包含curand函数的内核。
最后,我尝试通过cudaMemcpy调用将生成的矩阵复制回主机内存,但此时我收到错误“遇到非法内存访问”。
我尝试打印了wih和who在设备端的副本(即d_wih和d_who)的值,它们看起来是正确的;我在代码中保留了两个对调试有用的函数:
可以调用checkCudaError来检查最后一个cudaError_t字符串消息
showValues可用于打印在设备上分配的数组(device allocated array)的值
我提取了一段可以编译并能重现相同错误的示例代码,请帮帮我。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include<cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Host-side description of a three-layer network (input -> hidden -> output).
// Every pointer member is a heap buffer allocated by initNeuralNetwork().
struct TNeuralNetwork {
// Layer sizes; hidden_neurons is derived from the other two in initNeuralNetwork().
int input_neurons;
int hidden_neurons;
int output_neurons;
float *wih; // first layer of weights (input -> hidden): (input_neurons + 1) * hidden_neurons floats, the +1 is the bias
float *who; // second layer of weights (hidden -> output): (hidden_neurons + 1) * output_neurons floats, the +1 is the bias
float *wih_old; // same size as wih, zero-filled by initNeuralNetwork(); kept for the momentum term
float *who_old; // same size as who, zero-filled by initNeuralNetwork(); kept for the momentum term
// One float per output neuron; presumably the output-layer error (not used in the code shown).
float *erro;
// One float per hidden neuron; presumably the hidden-layer error (not used in the code shown).
float *errh;
float l; // learning rate (initNeuralNetwork() sets 0.001)
float m; // momentum (initNeuralNetwork() sets 0.2)
float *i; // values of the input neurons (input_neurons floats)
float *h; // values of the hidden neurons (hidden_neurons floats)
float *o; // values of the output neurons (output_neurons floats)
};
// Prints a message tagged with `str` and exits if the CUDA runtime reports a pending error.
__host__ void checkCudaError(char *str);
// Debug kernel: each thread whose global index is below `dim` prints one element of d_v.
__global__ void showValues(float *d_v, int dim);
// NOTE(review): the cuRAND state is taken BY VALUE here, so whatever the kernel
// initialises is a per-thread copy that the caller never sees.
__global__ void init_rand(unsigned int seed, curandState_t state_wih);
// NOTE(review): same by-value state parameter as init_rand; the kernel cannot
// reach a per-thread state array through it.
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out);
// Host wrapper that launches init_rand and generateRandomValues on the two device weight buffers.
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who);
// Fills in the layer sizes and hyper-parameters of `nn` and allocates its host buffers.
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel);
/*
 * Entry point.
 *
 * Builds a 102-input / 10-output network on the host, mirrors both weight
 * matrices (wih and who) in device memory, lets assignRandomWeight() fill them
 * with random values, and copies the result back into the host structure.
 *
 * Returns 0 on success; any CUDA failure terminates the process through
 * checkCudaError().
 */
int main(int argc, char **argv) {
    struct TNeuralNetwork nn;
    initNeuralNetwork(&nn, 102, 10);

    // Element counts of the two weight matrices; the +1 is the bias.
    const size_t wih_count = (size_t)(nn.input_neurons + 1) * nn.hidden_neurons;
    const size_t who_count = (size_t)(nn.hidden_neurons + 1) * nn.output_neurons;
    const size_t wih_bytes = wih_count * sizeof(float);
    const size_t who_bytes = who_count * sizeof(float);

    // Allocate device memory for BOTH matrices. generateRandomValues writes
    // through d_who, so handing it an unallocated pointer is an illegal access.
    float *d_wih = NULL;
    float *d_who = NULL;
    cudaMalloc((void**)&d_wih, wih_bytes);
    checkCudaError("malloc1");
    cudaMalloc((void**)&d_who, who_bytes);
    checkCudaError("malloc2");

    // Start from all-zero weights on the host and mirror them on the device.
    memset(nn.wih, 0, wih_bytes);
    memset(nn.who, 0, who_bytes);
    cudaMemcpy(d_wih, nn.wih, wih_bytes, cudaMemcpyHostToDevice);
    checkCudaError("memcpy0");
    cudaMemcpy(d_who, nn.who, who_bytes, cudaMemcpyHostToDevice);
    checkCudaError("memcpy0.1");

    assignRandomWeight(&nn, d_wih, d_who);
    // Kernel launches are asynchronous: synchronise here so that a fault raised
    // inside a kernel is reported at this point instead of at the next memcpy.
    cudaDeviceSynchronize();
    checkCudaError("assignRandomWeight");

    cudaMemcpy(nn.wih, d_wih, wih_bytes, cudaMemcpyDeviceToHost);
    checkCudaError("memcpy1");
    cudaMemcpy(nn.who, d_who, who_bytes, cudaMemcpyDeviceToHost);
    checkCudaError("memcpy2");

    cudaFree(d_wih);
    cudaFree(d_who);

    // Release the host buffers allocated by initNeuralNetwork().
    free(nn.wih);
    free(nn.who);
    free(nn.wih_old);
    free(nn.who_old);
    free(nn.i);
    free(nn.h);
    free(nn.o);
    free(nn.errh);
    free(nn.erro);
    return 0;
}
/*
 * Reports and aborts on a pending CUDA runtime error.
 *
 * Reads (and thereby clears) the last error recorded by the CUDA runtime.
 * If it is anything other than cudaSuccess, prints a message tagged with
 * `str` to stdout and terminates the process with exit(-1).
 *
 * str: label identifying the call site in the message.
 */
__host__ void checkCudaError(char *str){
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess){
        return;
    }
    printf("Cuda Error at %s: %s \n", str, cudaGetErrorString(status));
    exit(-1);
}
/*
 * Debug kernel: prints elements of a device array.
 *
 * Expects a 1-D launch. Each thread computes its global index and, if that
 * index is below `dim`, prints the corresponding element of d_v; threads past
 * the end do nothing.
 *
 * d_v: device pointer to the values to print.
 * dim: number of elements to print.
 */
__global__ void showValues(float *d_v, int dim){
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= dim){
        return;
    }
    printf("elemento[%d] = %.4f\n", idx, d_v[idx]);
}
/*
 * Runs curand_init() once per thread with sequence 0 and the thread's global
 * index as offset.
 *
 * NOTE(review): state_wih is received BY VALUE. curand_init() therefore fills
 * in a copy that is private to each thread and is discarded when the kernel
 * returns; no caller-visible state array is initialised. Making the state
 * persist requires a curandState_t* parameter indexed by the thread id.
 *
 * seed:      seed forwarded to curand_init().
 * state_wih: per-thread copy of a generator state (see note above).
 */
__global__ void init_rand(unsigned int seed, curandState_t state_wih){
    const int offset = threadIdx.x + blockIdx.x*blockDim.x;
    curandState_t *local_state = &state_wih;
    curand_init(seed, 0, offset, local_state);
}
// Kernel meant to fill wih and who with uniform random floats, one element per thread.
//
// NOTE(review): state_wih is a by-value parameter, i.e. exactly one
// curandState_t per thread. `&state_wih + tid` therefore points past that
// single object for every tid > 0, which is an out-of-bounds access. The state
// is also never initialised by init_rand, which only initialises its own copy.
// NOTE(review): both guards use `<=`, so the thread whose index equals the
// element count passes the guard as well; if wih / who hold exactly
// (inp + 1) * hid and (hid + 1) * out floats, that is one element past the end.
__global__ void generateRandomValues(curandState_t state_wih, float *wih, float *who, int inp, int hid, int out){
// Global 1-D thread index.
int tid = (blockIdx.x)*(blockDim.x) + threadIdx.x;
// Debug output: draws one raw 32-bit sample and prints it as a float.
printf("%.7f", (float)curand(&state_wih + tid));
if (tid <= (inp + 1)*hid){
wih[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", wih[tid]);
}
if (tid <= (hid + 1)*out){
who[tid] = (float)curand_uniform(&state_wih + tid);
printf("%.7f", who[tid]);
}
}
/* Allocates `count` zero-filled floats; prints a message and exits on failure. */
static float *allocZeroedFloats(size_t count, const char *what) {
    float *buffer = (float*)calloc(count, sizeof(float));
    // calloc(0, ...) may legitimately return NULL, so only a non-empty request counts as a failure.
    if (buffer == NULL && count != 0) {
        fprintf(stderr, "initNeuralNetwork: out of memory allocating %s\n", what);
        exit(-1);
    }
    return buffer;
}

/*
 * Initialises a network description and allocates all of its host buffers.
 *
 * The hidden layer gets two thirds of (bands + nlabel) neurons, truncated
 * towards zero. The learning rate is set to 0.001 and the momentum to 0.2.
 * Every buffer is zero-filled, so no member is left indeterminate.
 *
 * nn:     structure to fill in; its previous contents are overwritten.
 * bands:  number of input neurons.
 * nlabel: number of output neurons.
 *
 * Terminates the process with exit(-1) if an allocation fails. The caller
 * owns the buffers and releases them with free().
 */
void initNeuralNetwork(struct TNeuralNetwork *nn, int bands, int nlabel) {
    nn->input_neurons = bands;
    nn->output_neurons = nlabel;
    nn->hidden_neurons = (int)((bands + nlabel)*2.0f / 3.0f);
    nn->l = 0.001f;
    nn->m = 0.2f;

    // Weight matrices carry one extra row for the bias (+1).
    const size_t wih_count = (size_t)(bands + 1) * nn->hidden_neurons;
    const size_t who_count = (size_t)(nn->hidden_neurons + 1) * nlabel;

    nn->wih     = allocZeroedFloats(wih_count, "wih");
    nn->who     = allocZeroedFloats(who_count, "who");
    nn->wih_old = allocZeroedFloats(wih_count, "wih_old");
    nn->who_old = allocZeroedFloats(who_count, "who_old");

    nn->i    = allocZeroedFloats((size_t)bands, "i");
    nn->h    = allocZeroedFloats((size_t)nn->hidden_neurons, "h");
    nn->o    = allocZeroedFloats((size_t)nlabel, "o");
    nn->errh = allocZeroedFloats((size_t)nn->hidden_neurons, "errh");
    nn->erro = allocZeroedFloats((size_t)nlabel, "erro");
}
//curand
// Host wrapper: seeds the device generators and launches the kernel that
// writes random values into d_wih and d_who.
//
// NOTE(review): this version declares the cuRAND state as a single host-side
// object rather than a device pointer; see the notes on the lines below.
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
// Unused in this function.
cudaError_t cudaStatus;
// NOTE(review): a curandState_t object, not a curandState_t* device pointer.
curandState_t state_wih;
// Seed derived from the host clock; reseeds the host rand() on every call.
srand(time(NULL));
unsigned int seed = rand();
// Allocate the array of curandState_t used for the randomisation; it is no longer needed once this function returns
// NOTE(review): cudaMalloc stores the device address into the leading bytes of
// the host object `state_wih`; nothing here ever holds it as a pointer, and the
// allocation is never released with cudaFree.
cudaMalloc((void**)&state_wih, (nn->input_neurons + 1)*(nn->hidden_neurons)* sizeof(curandState_t));
// One thread per wih element, rounded up to whole blocks of 32 threads.
dim3 gridSize(ceil((double)((nn->input_neurons + 1)*(nn->hidden_neurons)) / 32));
dim3 blockSize(32);
// NOTE(review): both kernels receive `state_wih` by value (a copy of the host
// object), so they never see the device array allocated above.
init_rand << < gridSize, blockSize >> >(seed, state_wih);
generateRandomValues << < gridSize, blockSize >> >(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
}
答案 0 :(得分:1)
内核中“不正确的索引”会导致越界内存访问。CUDA运行时会在内核出错的那一刻销毁您的上下文,此后任何依赖于该上下文的CUDA操作都无法执行。cudaMemcpy调用之所以失败,是因为您的上下文已被破坏;这一点无法避免。
NVIDIA随CUDA工具包提供了名为cuda-memcheck的实用程序,请用它来诊断您的内核出了什么问题。
答案 1 :(得分:0)
我发现了我的错误:
我在assignRandomWeight函数中使用了“curandState_t”类型的变量,而实际上必须使用指向它的指针(curandState_t *)。
这是正确的版本:
/*
 * Fills the device weight buffers d_wih and d_who with random values.
 *
 * Allocates one cuRAND state per launched thread, seeds the states with
 * init_rand, launches generateRandomValues, waits for both kernels to finish
 * and releases the state array again.
 *
 * nn:    network description; only the three layer sizes are read.
 * d_wih: device buffer of (input_neurons + 1) * hidden_neurons floats.
 * d_who: device buffer of (hidden_neurons + 1) * output_neurons floats.
 *
 * Launch layout: 1-D grid of 32-thread blocks covering the larger of the two
 * buffers, so that every element of both has a thread. Any CUDA failure
 * terminates the process through checkCudaError().
 */
void assignRandomWeight(struct TNeuralNetwork *nn, float *d_wih, float *d_who) {
    const unsigned int threads_per_block = 32;
    const size_t wih_count = (size_t)(nn->input_neurons + 1) * nn->hidden_neurons;
    const size_t who_count = (size_t)(nn->hidden_neurons + 1) * nn->output_neurons;
    const size_t needed = (wih_count > who_count) ? wih_count : who_count;
    if (needed == 0) {
        return; // nothing to randomise, and a zero-sized grid is an invalid launch
    }

    // Integer ceil-division; the grid is rounded up to whole blocks.
    const unsigned int blocks = (unsigned int)((needed + threads_per_block - 1) / threads_per_block);
    // init_rand writes one state per launched thread without a bounds guard, so
    // the array must cover the whole (rounded-up) grid, not just `needed`.
    const size_t state_count = (size_t)blocks * threads_per_block;

    srand(time(NULL));
    unsigned int seed = rand();

    curandState_t *state_wih = NULL;
    cudaMalloc((void**)&state_wih, state_count * sizeof(curandState_t));
    checkCudaError("malloc curand states");

    dim3 gridSize(blocks);
    dim3 blockSize(threads_per_block);
    init_rand<<<gridSize, blockSize>>>(seed, state_wih);
    checkCudaError("init_rand launch");
    generateRandomValues<<<gridSize, blockSize>>>(state_wih, d_wih, d_who, nn->input_neurons, nn->hidden_neurons, nn->output_neurons);
    checkCudaError("generateRandomValues launch");

    // Wait for the kernels so that execution errors surface here, then release
    // the states: they are not needed once the weights have been written.
    cudaDeviceSynchronize();
    checkCudaError("assignRandomWeight sync");
    cudaFree(state_wih);
}
以及两个内核的正确版本:
/*
 * Writes one uniform random float into wih and/or who per thread.
 *
 * Launch layout: 1-D grid; thread `tid` handles wih[tid] and who[tid]. The
 * grid must contain at least as many threads as the larger of the two arrays
 * for every element to be written. No shared memory is used.
 *
 * state_wih: device array of generator states already initialised by
 *            curand_init(); must hold an entry for every thread index that
 *            falls inside either array.
 * wih:       device array of (inp + 1) * hid floats.
 * who:       device array of (hid + 1) * out floats.
 * inp, hid, out: layer sizes (the +1 is the bias).
 */
__global__ void generateRandomValues( curandState_t *state_wih, float *wih, float *who, int inp, int hid, int out){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int wih_count = (inp + 1) * hid;
    const int who_count = (hid + 1) * out;

    // Threads beyond both arrays have nothing to do and must not touch the states.
    if (tid >= wih_count && tid >= who_count){
        return;
    }

    // Work on a local copy of the generator state and store it back at the end,
    // so the sequence advances if the states are reused.
    curandState_t local_state = state_wih[tid];

    // Strict '<': valid indices are 0 .. count - 1.
    if (tid < wih_count){
        wih[tid] = curand_uniform(&local_state);
    }
    if (tid < who_count){
        who[tid] = curand_uniform(&local_state);
    }

    state_wih[tid] = local_state;
}
/*
 * Initialises one cuRAND generator state per launched thread.
 *
 * Launch layout: 1-D grid. Every thread calls curand_init() with the shared
 * seed, its global index as the subsequence number and offset 0, and stores
 * the result in state_wih[global index].
 *
 * Precondition: there is no bounds guard, so state_wih must hold at least
 * gridDim.x * blockDim.x elements.
 *
 * seed:      seed shared by all threads.
 * state_wih: device array receiving the initialised states.
 */
__global__ void init_rand(unsigned int seed, curandState_t *state_wih){
    const int subsequence = threadIdx.x + blockIdx.x*blockDim.x;
    curandState_t *slot = state_wih + subsequence;
    curand_init(seed, subsequence, 0, slot);
}