我要计算一个 4096x4096 矩阵与一个 4096x1 矩阵(列向量)的乘积,但不知为何,程序在将结果矩阵复制回主机内存时,因非法内存访问而失败。
// Host buffer that receives the 4096-element layer-8 result vector.
float *h_Layer8_Neurons_GPU = (float *)malloc(sizeof(float) * 4096);

// Device buffers: 4096x4096 weights, 4096 biases, and the 4096-element
// input (layer 7) and output (layer 8) vectors.
cudaMalloc((void**)&d_Layer7_Weights_GPU, sizeof(float) * (4096 * 4096));
cudaMalloc((void**)&d_Layer7_bias_GPU, sizeof(float) * 4096);
cudaMalloc((void**)&d_Layer7_Neurons_GPU, sizeof(float) * 4096);
cudaMalloc((void**)&d_Layer8_Neurons_GPU, sizeof(float) * 4096);

// cublasSgemm computes C = alpha * op(A) * op(B) + beta * C with
// op(A): m x k, op(B): k x n, C: m x n (all column-major).
// For [4096x4096] * [4096x1] = [4096x1] that means m = 4096, n = 1, k = 4096.
//
// The previous call passed (m, n, k) = (4096, 4096, 1), i.e. an outer product
// producing a 4096x4096 result. That wrote 4096*4096 floats into the
// 4096-float output buffer, which is the illegal access being reported.
//
// CUBLAS_OP_T is kept on the weights (assumes they are stored row-major --
// TODO confirm). The input vector is a plain 4096x1 column, so it must use
// CUBLAS_OP_N; with ldb = 4096 a transposed read would stride past its end.
// NOTE(review): if beta is non-zero, d_Layer8_Neurons_GPU must be initialised
// before this call, since cudaMalloc does not zero the allocation.
cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
            4096, 1, 4096,
            &alpha,
            d_Layer7_Weights_GPU, 4096,
            d_Layer7_Neurons_GPU, 4096,
            &beta,
            d_Layer8_Neurons_GPU, 4096);

// The GEMM executes asynchronously, so a fault inside it is only reported by
// the next synchronising call -- this blocking copy. That is why the error
// appeared here even though the copy itself is sized correctly.
err = cudaMemcpy(h_Layer8_Neurons_GPU, d_Layer8_Neurons_GPU, 4096 * sizeof(float), cudaMemcpyDeviceToHost);