Getting nan results using Peer-to-Peer in a Tesla K80 cluster

Date: 2015-12-28 17:57:30

Tags: c multithreading cuda openmp gpu

I am using UVA and OpenMP in my algorithm to make it more powerful.

The problem is that when I launch parallel kernels, for example, 3 CPU threads each launching one kernel at the same time, one of the threads reads nan values.

It seems that GPU X cannot read a variable allocated on GPU0.

That is odd, considering that I grant every GPU (in this case GPUs 1 and 2) access to GPU0.

Is there a problem with using UVA and OpenMP together? Or is it a problem with the code?

Here are the code and the results.

I have created an MCVE here to demonstrate the error:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "math_constants.h"
#include <omp.h>
#include <cufft.h>

inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
  #ifdef _WIN32
     return (bool)(pProp->tccDriver ? true : false);
  #else
     return (bool)(pProp->major >= 2);
  #endif
}

inline bool IsAppBuiltAs64()
{ 
  #if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
     return 1;
  #else
     return 0;
  #endif
}

__global__ void kernelFunction(cufftComplex *I, int i, int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    int k = threadIdx.y + blockDim.y * blockIdx.y;

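   // Only the (0,0) thread of the grid prints, so each kernel launch
   // produces exactly one line of output identifying its CPU thread.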
   if(j == 0 && k == 0){
       printf("I'm thread %d and I'm reading device_I[0] = %f\n", i, I[N*j+k].x);
   }
}

__host__ int main(int argc, char **argv) {
int num_gpus;
cudaGetDeviceCount(&num_gpus);

if(num_gpus < 1){
    printf("No CUDA capable devices were detected\n");
    return 1;
}

if (!IsAppBuiltAs64()){
    printf("%s is only supported on 64-bit OSs and the application must be built as a 64-bit target. Test is being waived.\n", argv[0]);
    exit(EXIT_SUCCESS);
}



printf("Number of host CPUs:\t%d\n", omp_get_num_procs());
printf("Number of CUDA devices:\t%d\n", num_gpus);


for(int i = 0; i < num_gpus; i++){
    cudaDeviceProp dprop;
    cudaGetDeviceProperties(&dprop, i);

    printf("> GPU%d = \"%15s\" %s capable of Peer-to-Peer (P2P)\n", i, dprop.name, (IsGPUCapableP2P(&dprop) ? "IS " : "NOT"));

    //printf("   %d: %s\n", i, dprop.name);
}
printf("---------------------------\n");


num_gpus = 3; //The case that fails
omp_set_num_threads(num_gpus);

if(num_gpus > 1){
    for(int i=1; i<num_gpus; i++){
        cudaDeviceProp dprop0, dpropX;
        cudaGetDeviceProperties(&dprop0, 0);
        cudaGetDeviceProperties(&dpropX, i);
        int canAccessPeer0_x, canAccessPeerx_0;
        cudaDeviceCanAccessPeer(&canAccessPeer0_x, 0, i);
        cudaDeviceCanAccessPeer(&canAccessPeerx_0, i, 0);
        printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dprop0.name, 0, dpropX.name, i, canAccessPeer0_x ? "Yes" : "No");
        printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dpropX.name, i, dprop0.name, 0, canAccessPeerx_0 ? "Yes" : "No");
        if(canAccessPeer0_x == 0 || canAccessPeerx_0 == 0){
            printf("Two or more SM 2.0 class GPUs are required for %s to run.\n", argv[0]);
            printf("Support for UVA requires a GPU with SM 2.0 capabilities.\n");
            printf("Peer to Peer access is not available between GPU%d <-> GPU%d, waiving test.\n", 0, i);
            exit(EXIT_SUCCESS);
        }else{
            cudaSetDevice(0);
            printf("Granting access from 0 to %d...\n", i);
            cudaDeviceEnablePeerAccess(i, 0);
            cudaSetDevice(i);
            printf("Granting access from %d to 0...\n", i);
            cudaDeviceEnablePeerAccess(0, 0);

            printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 0, i);
            const bool has_uva = (dprop0.unifiedAddressing && dpropX.unifiedAddressing);
            printf("> %s (GPU%d) supports UVA: %s\n", dprop0.name, 0, (dprop0.unifiedAddressing ? "Yes" : "No"));
            printf("> %s (GPU%d) supports UVA: %s\n", dpropX.name, i, (dpropX.unifiedAddressing ? "Yes" : "No"));
            if(has_uva){
                printf("Both GPUs can support UVA, enabling...\n");
            }else{
                printf("At least one of the two GPUs does NOT support UVA, waiving test.\n");
                exit(EXIT_SUCCESS);
            }
        }
    }
}

int M = 512;
int N = 512;

cufftComplex *host_I = (cufftComplex*)malloc(M*N*sizeof(cufftComplex));
for(int i=0;i<M;i++){
    for(int j=0;j<N;j++){
        host_I[N*i+j].x = 0.001;
        host_I[N*i+j].y = 0;
    }
}

cufftComplex *device_I;
cudaSetDevice(0);
cudaMalloc((void**)&device_I, sizeof(cufftComplex)*M*N);
cudaMemset(device_I, 0, sizeof(cufftComplex)*M*N);
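// Note: with pitch == width == sizeof(cufftComplex) and height == M*N, this
// cudaMemcpy2D degenerates to an element-by-element copy of the whole buffer,
// equivalent to a plain cudaMemcpy of M*N*sizeof(cufftComplex) bytes.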
cudaMemcpy2D(device_I, sizeof(cufftComplex), host_I, sizeof(cufftComplex), sizeof(cufftComplex), M*N, cudaMemcpyHostToDevice);

dim3 threads(32,32);
dim3 blocks(M/threads.x, N/threads.y);
dim3 threadsPerBlockNN = threads;
dim3 numBlocksNN = blocks;
#pragma omp parallel
{
    unsigned int i = omp_get_thread_num();
    unsigned int num_cpu_threads = omp_get_num_threads();

    // set and check the CUDA device for this CPU thread
    int gpu_id = -1;
    cudaSetDevice(i % num_gpus);   // "% num_gpus" allows more CPU threads than GPU devices
    cudaGetDevice(&gpu_id);
    //printf("CPU thread %d (of %d) uses CUDA device %d\n", i, num_cpu_threads, gpu_id);
    kernelFunction<<<numBlocksNN, threadsPerBlockNN>>>(device_I, i, N);
    cudaDeviceSynchronize();
}

cudaFree(device_I);

for(int i=1; i<num_gpus; i++){
    cudaSetDevice(0);
    cudaDeviceDisablePeerAccess(i);
    cudaSetDevice(i);
    cudaDeviceDisablePeerAccess(0);
}

for(int i=0; i<num_gpus; i++ ){
    cudaSetDevice(i);
    cudaDeviceReset();
}

free(host_I);

return 0;
}

The results are:

Both GPUs can support UVA, enabling...
I'm thread 0 and I'm reading device_I[0] = 0.001000
I'm thread 2 and I'm reading device_I[0] = 0.001000
I'm thread 1 and I'm reading device_I[0] = -nan

The command line to compile is:

nvcc -Xcompiler -fopenmp -lgomp -arch=sm_37 main.cu -lcufft

Here is the result of simpleP2P:

[miguel.carcamo@belka simpleP2P]$ ./simpleP2P 
[./simpleP2P] - Starting...
Checking for multiple GPUs...
CUDA-capable device count: 8
> GPU0 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU1 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU2 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU3 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU4 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU5 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU6 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)
> GPU7 = "      Tesla K80" IS  capable of Peer-to-Peer (P2P)

Checking GPU(s) for support of peer to peer memory access...
> Peer-to-Peer (P2P) access from Tesla K80 (GPU0) -> Tesla K80 (GPU1) : Yes
> Peer-to-Peer (P2P) access from Tesla K80 (GPU1) -> Tesla K80 (GPU0) : Yes
Enabling peer access between GPU0 and GPU1...
Checking GPU0 and GPU1 for UVA capabilities...
> Tesla K80 (GPU0) supports UVA: Yes
> Tesla K80 (GPU1) supports UVA: Yes
Both GPUs can support UVA, enabling...
Allocating buffers (64MB on GPU0, GPU1 and CPU Host)...
Creating event handles...
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU1: 0.79GB/s
Preparing host buffer and memcpy to GPU0...
Run kernel on GPU1, taking source data from GPU0 and writing to GPU1...
Run kernel on GPU0, taking source data from GPU1 and writing to GPU0...
Copy data back to host from GPU0 and verify results...
Verification error @ element 0: val = nan, ref = 0.000000
Verification error @ element 1: val = nan, ref = 4.000000
Verification error @ element 2: val = nan, ref = 8.000000
Verification error @ element 3: val = nan, ref = 12.000000
Verification error @ element 4: val = nan, ref = 16.000000
Verification error @ element 5: val = nan, ref = 20.000000
Verification error @ element 6: val = nan, ref = 24.000000
Verification error @ element 7: val = nan, ref = 28.000000
Verification error @ element 8: val = nan, ref = 32.000000
Verification error @ element 9: val = nan, ref = 36.000000
Verification error @ element 10: val = nan, ref = 40.000000
Verification error @ element 11: val = nan, ref = 44.000000
Disabling peer access...
Shutting down...
Test failed!

1 Answer:

Answer 0 (score: 2):

Based on the debugging in the comments, it seems the problem was ultimately related to the system being used, not the OP's code.

The K80 is a dual-GPU device, so it has a PCIE bridge chip on board. Proper use of this configuration, especially when using Peer-to-Peer (P2P) traffic, requires appropriate settings in the upstream PCIE switches and/or the root complex. These settings are normally made by the system BIOS and are not normally/typically software-configurable.

One possible indicator that these settings are incorrect is that the simpleP2P CUDA sample code will report errors during results validation. Therefore, a good test on any system where you are having trouble with P2P code is to run this particular CUDA sample (simpleP2P). If validation errors are reported (see an example posted by the OP), then these should be addressed first, before any attempt is made to debug the user's P2P code.

The best recommendation is to use a system that has been validated by the system vendor for K80 usage. This is generally good practice for the use of any Tesla GPU, as these GPUs tend to place significant demands on the host system in terms of:

  • power delivery
  • cooling requirements
  • system compatibility (two examples being the types of PCIE settings discussed here, as well as the resource mapping and bootability problems the OP also mentioned in the comments)

OEM-validated systems will generally have the fewest issues associated with the above demands that Tesla GPUs place on the host system.

For this particular issue, troubleshooting starts with the simpleP2P test. When validation errors are observed in that test (but no other CUDA runtime errors are reported), the PCIE settings may be suspect. The easiest way to attempt to address these is to check for a newer/updated system BIOS that may have the correct settings for this type of usage, or that offers BIOS setup options allowing the user to make the necessary changes. The settings involved here are PCIE ACS settings, and if a BIOS setup option is available, it will likely involve those terms. Since BIOS setups vary from system to system, it is not possible to be specific here.

If a BIOS update and/or settings modification does not resolve the problem, then it is probably not fixable for that particular system type. It is possible to troubleshoot the process a bit further using the final steps described here, but such troubleshooting, even if successful, cannot lead to a permanent fix (i.e. one that survives a reboot) without BIOS modifications.

If the simpleP2P test runs correctly, debugging focus should return to the user's code. The general recommendations of using proper cuda error checking and running the code with cuda-memcheck apply. Furthermore, the simpleP2P sample source code can be referred to as an example of the correct use of P2P functionality.
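
As a concrete illustration of what such error checking might look like, here is a minimal sketch; the macro name cudaCheckErrors and the call sites shown are illustrative, not part of the CUDA API or of the linked answer:

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

// Example usage around the launch in the MCVE above:
kernelFunction<<<numBlocksNN, threadsPerBlockNN>>>(device_I, i, N);
cudaCheckErrors("kernel launch failed");    // catches launch/configuration errors
cudaDeviceSynchronize();
cudaCheckErrors("kernel execution failed"); // catches errors raised during execution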

Note that, in general, P2P support can vary by GPU or GPU family. The ability to run P2P on one GPU type or GPU family does not necessarily mean it will work on another GPU type or family, even in the same system/setup. The final determinant of GPU P2P support is the runtime query provided via cudaDeviceCanAccessPeer. P2P support can also vary by system and other factors. No statement made here is a guarantee of P2P support for any particular GPU in any particular setup.
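
For reference, this runtime query amounts to the same check the MCVE above already performs; a condensed, self-contained sketch (device numbers 0 and 1 are arbitrary examples) could look like this:

#include <stdio.h>
#include "cuda_runtime.h"

int main(void)
{
    // Ask the runtime, in both directions, whether P2P access is possible.
    int can01 = 0, can10 = 0;
    cudaDeviceCanAccessPeer(&can01, 0, 1); // can device 0 access device 1?
    cudaDeviceCanAccessPeer(&can10, 1, 0); // can device 1 access device 0?
    printf("P2P 0 -> 1: %s\n", can01 ? "Yes" : "No");
    printf("P2P 1 -> 0: %s\n", can10 ? "Yes" : "No");
    return 0;
}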