The problem is that I installed CUDA 9.1 (.deb) on Ubuntu 16.04 LTS in what looks like the correct way: when I compile a CUDA program it shows no errors, but running the compiled binary does not give the expected result, as if no memory transfer happens between host and device.
The example, example3.cu, looks like this:
#include <stdio.h>
#include <iostream>
using namespace std;
__global__ void add(int *a, int *b, int *c){
    *c = *a + *b;
}

int main(void){
    int a, b, c;
    int *d_a, *d_b, *d_c;
    int size = sizeof(int);

    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    a = 2;
    b = 7;

    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    //<<<N,L>>>
    add<<<1,1>>>(d_a, d_b, d_c);

    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    cout << "Result: " << a << " + " << b << " = " << c << endl;
    return 0;
}
Compilation:
$ nvcc example3.cu -o example3.out
$ ./example3.out
Result: 2 + 7 = -571759751
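Since the build itself reports nothing, one way to surface a hidden runtime failure without touching the source (a minimal check, assuming the toolkit's bundled cuda-memcheck tool is on the PATH) is to run the binary under it:

$ cuda-memcheck ./example3.out

Any API or launch error it reports would point at the call that actually fails; the error-checking edit further below does the same thing from inside the code.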
Here are some details of the installation (note the driver loads "using threaded interrupts"):
$ lspci | grep -i nvidia
01:00.0 VGA compatible controller: NVIDIA Corporation GF108 [GeForce GT 730] (rev a1)
01:00.1 Audio device: NVIDIA Corporation GF108 High Definition Audio Controller (rev a1)
$ uname -a
Linux asis 4.13.0-37-generic #42~16.04.1-Ubuntu SMP Wed Mar 7 16:03:28 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
$ uname -m && cat /etc/*release
x86_64
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=16.04
DISTRIB_CODENAME=xenial
DISTRIB_DESCRIPTION="Ubuntu 16.04.4 LTS"
NAME="Ubuntu"
VERSION="16.04.4 LTS (Xenial Xerus)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 16.04.4 LTS"
VERSION_ID="16.04"
HOME_URL="http://www.ubuntu.com/"
SUPPORT_URL="http://help.ubuntu.com/"
BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/"
VERSION_CODENAME=xenial
UBUNTU_CODENAME=xenial
$ gcc --version
gcc (Ubuntu 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
$ uname -r
4.13.0-37-generic
$ nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2017 NVIDIA Corporation
Built on Fri_Nov__3_21:07:56_CDT_2017
Cuda compilation tools, release 9.1, V9.1.85
$ dmesg |grep NVRM
[ 1.110081] NVRM: loading NVIDIA UNIX x86_64 Kernel Module 390.25 Wed Jan 24 20:02:43 PST 2018 (using threaded interrupts)
$ ls -l /usr/bin/ |grep nvidia
lrwxrwxrwx 1 root root 52 mar 21 11:53 nvidia-bug-report.sh -> /etc/alternatives/x86_64-linux-gnu_nvidia_bug_report
lrwxrwxrwx 1 root root 58 mar 21 11:53 nvidia-cuda-mps-control -> /etc/alternatives/x86_64-linux-gnu_nvidia-cuda-mps-control
lrwxrwxrwx 1 root root 57 mar 21 11:53 nvidia-cuda-mps-server -> /etc/alternatives/x86_64-linux-gnu_nvidia-cuda-mps-server
lrwxrwxrwx 1 root root 51 mar 21 11:53 nvidia-debugdump -> /etc/alternatives/x86_64-linux-gnu_nvidia-debugdump
-rwxr-xr-x 1 root root 270 ene 30 09:16 nvidia-detector
lrwxrwxrwx 1 root root 54 mar 21 11:53 nvidia-persistenced -> /etc/alternatives/x86_64-linux-gnu_nvidia_persistenced
-rwxr-xr-x 1 root root 216424 dic 1 19:03 nvidia-settings
lrwxrwxrwx 1 root root 45 mar 21 11:53 nvidia-smi -> /etc/alternatives/x86_64-linux-gnu_nvidia_smi
lrwxrwxrwx 1 root root 49 mar 21 11:53 nvidia-xconfig -> /etc/alternatives/x86_64-linux-gnu_nvidia_xconfig
$ lsmod | grep nvidia
nvidia_uvm 757760 0
nvidia_drm 40960 1
nvidia_modeset 1093632 6 nvidia_drm
drm_kms_helper 167936 1 nvidia_drm
drm 360448 4 nvidia_drm,drm_kms_helper
nvidia 14323712 388 nvidia_modeset,nvidia_uvm
ipmi_msghandler 45056 2 nvidia,ipmi_devintf
Binaries from the CUDA samples:
$ ./deviceQuery
./deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GT 730"
CUDA Driver Version / Runtime Version 9.1 / 9.1
CUDA Capability Major/Minor version number: 2.1
Total amount of global memory: 1982 MBytes (2078736384 bytes)
MapSMtoCores for SM 2.1 is undefined. Default to use 64 Cores/SM
MapSMtoCores for SM 2.1 is undefined. Default to use 64 Cores/SM
( 2) Multiprocessors, ( 64) CUDA Cores/MP: 128 CUDA Cores
GPU Max Clock rate: 1400 MHz (1.40 GHz)
Memory Clock rate: 810 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 131072 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per multiprocessor: 1536
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (65535, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Supports Cooperative Kernel Launch: No
Supports MultiDevice Co-op Kernel Launch: No
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 9.1, CUDA Runtime Version = 9.1, NumDevs = 1
Result = PASS
$ ./deviceQueryDrv
./deviceQueryDrv Starting...
CUDA Device Query (Driver API) statically linked version
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GT 730"
CUDA Driver Version: 9.1
CUDA Capability Major/Minor version number: 2.1
Total amount of global memory: 1982 MBytes (2078736384 bytes)
MapSMtoCores for SM 2.1 is undefined. Default to use 64 Cores/SM
MapSMtoCores for SM 2.1 is undefined. Default to use 64 Cores/SM
( 2) Multiprocessors, ( 64) CUDA Cores/MP: 128 CUDA Cores
GPU Max Clock rate: 1400 MHz (1.40 GHz)
Memory Clock rate: 810 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 131072 bytes
Max Texture Dimension Sizes 1D=(65536) 2D=(65536, 65535) 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per multiprocessor: 1536
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (65535, 65535, 65535)
Texture alignment: 512 bytes
Maximum memory pitch: 2147483647 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Concurrent kernel execution: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Supports Cooperative Kernel Launch: No
Supports MultiDevice Co-op Kernel Launch: No
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Result = PASS
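Both device query samples report compute capability 2.1 for this GT 730. For reference, a minimal sketch that reads the same value straight from the runtime API (assuming only cudaGetDeviceProperties and a single device 0) looks like this:

#include <cstdio>
#include <cuda_runtime.h>

int main(void){
    cudaDeviceProp prop;
    // query device 0 and print its compute capability (major.minor)
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess){
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("Device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}

It compiles the same way as example3.cu (nvcc file.cu -o file.out).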
EDIT: example3.cu with error checking added:
#include <stdio.h>
#include <iostream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
using namespace std;
__global__ void add(int *a, int *b, int *c){
    *c = *a + *b;
}

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

int main(void){
    int a, b, c;
    int *d_a, *d_b, *d_c;
    int size = sizeof(int);

    gpuErrchk( cudaMalloc((void **)&d_a, size) );
    gpuErrchk( cudaMalloc((void **)&d_b, size) );
    gpuErrchk( cudaMalloc((void **)&d_c, size) );

    a = 2;
    b = 7;

    gpuErrchk( cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice) );

    //<<<N,L>>>
    add<<<1,1>>>(d_a, d_b, d_c);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk( cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost) );

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    cout << "Result: " << a << " + " << b << " = " << c << endl;
    return 0;
}
Compilation:
$ nvcc example3.cu -o example3.out
$ ./example3.out
Result: 2 + 7 =
GPUassert: no kernel image is available for execution on the device example3.cu 32