在MATLAB Cuda中获取次要输出的随机值

时间:2016-12-22 18:16:51

标签: arrays matlab memory-management cuda gpu

问题: 我得到两个计算的数组和两个期望的输出

  1. 右计算输出
  2. 随机数,旧数字,其他数组中的数字
  3. 我正在使用MATLAB R2016B和这个Coda版本+ GPU:

     CUDADevice with properties:
    
                      Name: 'GeForce GT 525M'
                     Index: 1
         ComputeCapability: '2.1'
            SupportsDouble: 1
             DriverVersion: 8
            ToolkitVersion: 7.5000
        MaxThreadsPerBlock: 1024
          MaxShmemPerBlock: 49152
        MaxThreadBlockSize: [1024 1024 64]
               MaxGridSize: [65535 65535 65535]
                 SIMDWidth: 32
               TotalMemory: 1.0737e+09
           AvailableMemory: 947929088
       MultiprocessorCount: 2
              ClockRateKHz: 1200000
               ComputeMode: 'Default'
      GPUOverlapsTransfers: 1
    KernelExecutionTimeout: 1
          CanMapHostMemory: 1
           DeviceSupported: 1
            DeviceSelected: 1
    

    我现在尝试使用GPU添加和减去两个不同的数组并将其返回到MATLAB。

    MATLAB代码:

    n = 10;
    as = [1,1,1];
    bs = [10,10,10];
    
    for i = 2:n+1
      as(end+1,:) = [i,i,i];
      bs(end+1,:) = [10,10,10];
    end
    as = as *1;
    
    % Load the kernel
    cudaFilename = 'add2.cu';
    ptxFilename = ['add2.ptx'];
    
    % Check if the files are awareable
    if((exist(cudaFilename, 'file') || exist(ptxFilename, 'file')) == 2)
      error('CUDA FILES ARE NOT HERE');
    end
    kernel = parallel.gpu.CUDAKernel( ptxFilename, cudaFilename );
    
    % Make sure we have sufficient blocks to cover all of the locations
    kernel.ThreadBlockSize = [kernel.MaxThreadsPerBlock,1,1];
    kernel.GridSize = [ceil(n/kernel.MaxThreadsPerBlock),1];
    
    % Call the kernel
    outadd = zeros(n,1, 'single' );
    outminus = zeros(n,1, 'single' );
    [outadd, outminus] = feval( kernel, outadd,outminus, as, bs );
    

    Cuda片段

    #include "cuda_runtime.h"
    #include "add_wrapper.hpp"
    #include <stdio.h>
    
    __device__ size_t calculateGlobalIndex() {
        // Which block are we?
        size_t const globalBlockIndex = blockIdx.x + blockIdx.y * gridDim.x;
        // Which thread are we within the block?
        size_t const localThreadIdx = threadIdx.x + blockDim.x * threadIdx.y;
        // How big is each block?
        size_t const threadsPerBlock = blockDim.x*blockDim.y;
        // Which thread are we overall?
        return localThreadIdx + globalBlockIndex*threadsPerBlock;
    }
    
    __global__ void addKernel(float *c, float *d, const float *a, const  float *b)
    {
        int i = calculateGlobalIndex();
        c[i] = a[i] + b[i];
        d[i] = a[i] - b[i];
    }
    
    // C = A + B
    // D = A - B
    void addWithCUDA(float *cpuC,float *cpuD, const float *cpuA, const float *cpuB, const size_t sz)
    {   
    //TODO: add error checking
    
    // choose which GPU to run on
    cudaSetDevice(0);
    
    // allocate GPU buffers
    float *gpuA, *gpuB, *gpuC, *gpuD;
    cudaMalloc((void**)&gpuA, sz*sizeof(float));
    cudaMalloc((void**)&gpuB, sz*sizeof(float));
    cudaMalloc((void**)&gpuC, sz*sizeof(float));
    cudaMalloc((void**)&gpuD, sz*sizeof(float));
    cudaCheckErrors("cudaMalloc fail");
    
    // copy input vectors from host memory to GPU buffers
    cudaMemcpy(gpuA, cpuA, sz*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gpuB, cpuB, sz*sizeof(float), cudaMemcpyHostToDevice);
    
    // launch kernel on the GPU with one thread per element
    addKernel<<<1,sz>>>(gpuC, gpuD, gpuA, gpuB);
    
    // wait for the kernel to finish
    cudaDeviceSynchronize();
    
    // copy output vector from GPU buffer to host memory
    cudaMemcpy(cpuC, gpuC, sz*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(cpuD, gpuD, sz*sizeof(float), cudaMemcpyDeviceToHost);
    
    
    // cleanup
    cudaFree(gpuA);
    cudaFree(gpuB);
    cudaFree(gpuC);
    cudaFree(gpuD);
    }
    
    void resetDevice()
    {
        cudaDeviceReset();
    }
    
    运行代码后,

    [outadd, outminus]是MATLAB中的两个GPU数组对象。

    始终正确计算

    Outaddoutminus很少正确,大多数时间内包含随机整数或浮点数,零或甚至outadd的值。

    如果我交换了算术运算的顺序,那么它对另一个算术运算了,所以不应该正确计算“outminus”吗?

1 个答案:

答案 0 :(得分:1)

使用@Robert Crovella提示不必要的线程可能导致访问错误,我只是为线程添加了一个限制。

<强> MATLAB

[outadd, outminus] = feval( kernel, outadd,outminus, as, bs, n);

CUDA KERNEL METHOD

__global__ void addKernel(float *c, float *d, const float *a, const float *b, const float n)
{  
    int i = calculateGlobalIndex();
    if ( i < n ){
        c[i] = a[i] + b[i];
        d[i] = a[i] - b[i];
   }
}

我认为它仍然不是最佳解决方案,因为即使最不应该使用那么多资源,GPU仍会启动所有线程。

以正确的方式重新处理后,我会将其上传到此处。