The problem is that I installed CUDA 9.1 (.deb) on Ubuntu 16.04 LTS in what looks like the correct way: when I compile a CUDA program it shows no errors, but running the compiled binary does not give the expected result, as if no memory transfer happens between host and device.
The example, example3.cu, looks like this:
#include <stdio.h>
#include <iostream>
using namespace std;
__global__ void add(int *a, int *b, int *c){
    *c = *a + *b;
}

int main(void){
    int a, b, c;
    int *d_a, *d_b, *d_c;
    int size = sizeof(int);

    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    a = 2;
    b = 7;

    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    //<<<N,L>>>
    add<<<1,1>>>(d_a, d_b, d_c);

    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    cout << "Result: " << a << " + " << b << " = " << c << endl;
    return 0;
}
Compilation:
$ nvcc example3.cu -o example3.out
$ ./example3.out
Result: 2 + 7 = -571759751
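Since the build itself reports nothing, one way to surface a hidden runtime failure without touching the source (a minimal check, assuming the toolkit's bundled cuda-memcheck tool is on the PATH) is to run the binary under it:

$ cuda-memcheck ./example3.out

Any API or launch error it reports would point at the call that actually fails; the error-checking edit further below does the same thing from inside the code.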
Here are some details of the installation (note the driver loads "using threaded interrupts"):
$ lspci | grep -i nvidia
01:00.0 VGA compatible controller: NVIDIA Corporation GF108 [GeForce GT 730] (rev a1)
01:00.1 Audio device: NVIDIA Corporation GF108 High Definition Audio Controller (rev a1)
$ uname -a
Linux asis 4.13.0-37-generic #42~16.04.1-Ubuntu SMP Wed Mar 7 16:03:28 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
$ uname -m && cat /etc/*release
x86_64
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=16.04
DISTRIB_CODENAME=xenial
DISTRIB_DESCRIPTION="Ubuntu 16.04.4 LTS"
NAME="Ubuntu"
VERSION="16.04.4 LTS (Xenial Xerus)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 16.04.4 LTS"
VERSION_ID="16.04"
HOME_URL="http://www.ubuntu.com/"
SUPPORT_URL="http://help.ubuntu.com/"
BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/"
VERSION_CODENAME=xenial
UBUNTU_CODENAME=xenial
$ gcc --version
gcc (Ubuntu 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
$ uname -r
4.13.0-37-generic
$ nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2017 NVIDIA Corporation
Built on Fri_Nov__3_21:07:56_CDT_2017
Cuda compilation tools, release 9.1, V9.1.85
$ dmesg |grep NVRM
[ 1.110081] NVRM: loading NVIDIA UNIX x86_64 Kernel Module 390.25 Wed Jan 24 20:02:43 PST 2018 (using threaded interrupts)
$ ls -l /usr/bin/ |grep nvidia
lrwxrwxrwx 1 root root 52 mar 21 11:53 nvidia-bug-report.sh -> /etc/alternatives/x86_64-linux-gnu_nvidia_bug_report
lrwxrwxrwx 1 root root 58 mar 21 11:53 nvidia-cuda-mps-control -> /etc/alternatives/x86_64-linux-gnu_nvidia-cuda-mps-control
lrwxrwxrwx 1 root root 57 mar 21 11:53 nvidia-cuda-mps-server -> /etc/alternatives/x86_64-linux-gnu_nvidia-cuda-mps-server
lrwxrwxrwx 1 root root 51 mar 21 11:53 nvidia-debugdump -> /etc/alternatives/x86_64-linux-gnu_nvidia-debugdump
-rwxr-xr-x 1 root root 270 ene 30 09:16 nvidia-detector
lrwxrwxrwx 1 root root 54 mar 21 11:53 nvidia-persistenced -> /etc/alternatives/x86_64-linux-gnu_nvidia_persistenced
-rwxr-xr-x 1 root root 216424 dic 1 19:03 nvidia-settings
lrwxrwxrwx 1 root root 45 mar 21 11:53 nvidia-smi -> /etc/alternatives/x86_64-linux-gnu_nvidia_smi
lrwxrwxrwx 1 root root 49 mar 21 11:53 nvidia-xconfig -> /etc/alternatives/x86_64-linux-gnu_nvidia_xconfig
$ lsmod | grep nvidia
nvidia_uvm 757760 0
nvidia_drm 40960 1
nvidia_modeset 1093632 6 nvidia_drm
drm_kms_helper 167936 1 nvidia_drm
drm 360448 4 nvidia_drm,drm_kms_helper
nvidia 14323712 388 nvidia_modeset,nvidia_uvm
ipmi_msghandler 45056 2 nvidia,ipmi_devintf
Binaries from the CUDA samples:
$ ./deviceQuery
./deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GT 730"
CUDA Driver Version / Runtime Version 9.1 / 9.1
CUDA Capability Major/Minor version number: 2.1
Total amount of global memory: 1982 MBytes (2078736384 bytes)
MapSMtoCores for SM 2.1 is undefined. Default to use 64 Cores/SM
MapSMtoCores for SM 2.1 is undefined. Default to use 64 Cores/SM
( 2) Multiprocessors, ( 64) CUDA Cores/MP: 128 CUDA Cores
GPU Max Clock rate: 1400 MHz (1.40 GHz)
Memory Clock rate: 810 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 131072 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per multiprocessor: 1536
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (65535, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Supports Cooperative Kernel Launch: No
Supports MultiDevice Co-op Kernel Launch: No
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 9.1, CUDA Runtime Version = 9.1, NumDevs = 1
Result = PASS
$ ./deviceQueryDrv
./deviceQueryDrv Starting...
CUDA Device Query (Driver API) statically linked version
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GT 730"
CUDA Driver Version: 9.1
CUDA Capability Major/Minor version number: 2.1
Total amount of global memory: 1982 MBytes (2078736384 bytes)
MapSMtoCores for SM 2.1 is undefined. Default to use 64 Cores/SM
MapSMtoCores for SM 2.1 is undefined. Default to use 64 Cores/SM
( 2) Multiprocessors, ( 64) CUDA Cores/MP: 128 CUDA Cores
GPU Max Clock rate: 1400 MHz (1.40 GHz)
Memory Clock rate: 810 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 131072 bytes
Max Texture Dimension Sizes 1D=(65536) 2D=(65536, 65535) 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per multiprocessor: 1536
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (65535, 65535, 65535)
Texture alignment: 512 bytes
Maximum memory pitch: 2147483647 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Concurrent kernel execution: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Supports Cooperative Kernel Launch: No
Supports MultiDevice Co-op Kernel Launch: No
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Result = PASS
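Both device query samples report compute capability 2.1 for this GT 730. For reference, a minimal sketch that reads the same value straight from the runtime API (assuming only cudaGetDeviceProperties and a single device 0) looks like this:

#include <cstdio>
#include <cuda_runtime.h>

int main(void){
    cudaDeviceProp prop;
    // query device 0 and print its compute capability (major.minor)
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess){
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("Device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}

It compiles the same way as example3.cu (nvcc file.cu -o file.out).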
EDIT: example3.cu with error checking added:
#include <stdio.h>
#include <iostream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
using namespace std;
__global__ void add(int *a, int *b, int *c){
    *c = *a + *b;
}

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

int main(void){
    int a, b, c;
    int *d_a, *d_b, *d_c;
    int size = sizeof(int);

    gpuErrchk( cudaMalloc((void **)&d_a, size) );
    gpuErrchk( cudaMalloc((void **)&d_b, size) );
    gpuErrchk( cudaMalloc((void **)&d_c, size) );

    a = 2;
    b = 7;

    gpuErrchk( cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice) );

    //<<<N,L>>>
    add<<<1,1>>>(d_a, d_b, d_c);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk( cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost) );

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    cout << "Result: " << a << " + " << b << " = " << c << endl;
    return 0;
}
Compilation:
$ nvcc example3.cu -o example3.out
$ ./example3.out
Result: 2 + 7 =
GPUassert: no kernel image is available for execution on the device example3.cu 32