Question

我在Windows 7 64位系统上使用Visual Studio 2010 Professional运行CUDA 4.2。

首先，我运行了以下代码：

// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

using namespace std; 

// Kernel: each thread handles the element at its flat global thread index.
//   d_bPtr     - array that receives tid at position tid (for 0 < tid < count)
//   count      - upper bound on the indices this kernel touches
//   d_bStopPtr - flag array; a flag is set to 0 only if the flag before it reads 0
// Thread 0 only clears d_bStopPtr[0]; it does not write d_bPtr[0].
// NOTE(review): d_bStopPtr[tid-1] may be written by another thread of the same
// launch and nothing here synchronizes with that write, so how far the zeros
// propagate during one launch is not determined by this code alone.
__global__ 
void dosomething(int *d_bPtr, int count, int* d_bStopPtr)
{
    // flat 1D global index of this thread
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid==0)
       d_bStopPtr[tid]=0;
    else if(tid<count)
    {
       d_bPtr[tid]=tid;
// only if the array cell before it is 0, then change it to 0 too
        if (d_bStopPtr[tid-1]==0 )
           d_bStopPtr[tid]=0;

    }
}

int main()
{
    // Number of elements in every vector below.
    int count = 100000;

    // One host buffer and two device arrays: values (all 0) and stop flags (all 1).
    thrust::host_vector<int> h_a(count);
    thrust::device_vector<int> d_b(count, 0);
    thrust::device_vector<int> d_bStop(count, 1);
    int* d_bPtr = thrust::raw_pointer_cast(&d_b[0]);
    int* d_bStopPtr = thrust::raw_pointer_cast(&d_bStop[0]);

    // Derive the launch configuration from the properties of device 0.
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int threadsPerBlock = prop.maxThreadsDim[0];
    int blocksNeeded = (count + threadsPerBlock - 1) / threadsPerBlock;
    int blocksPerGrid = min(prop.maxGridSize[0], blocksNeeded);

    // Print three sample values before any kernel has run.
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;

    // Relaunch the kernel until the last stop flag, read back on the host, is 0.
    for (;;)
    {
        const int lastFlag = d_bStop[count - 1];
        if (lastFlag == 0)
            break;
        dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr, count, d_bStopPtr);
    }

    // Print the same three sample values after the kernel launches.
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;

    // Block on console input so the output stays visible.
    int x;
    cin >> x;
    return 0;
}

然而，这样每次循环都需要在主机端检查一次while条件，速度很慢。所以我考虑改为在内核中检查这个设备向量的状态，于是把代码改成了下面这样：

// include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

using namespace std; 

// Kernel: each thread handles the element at its flat global thread index and
// keeps looping inside the kernel until the last flag reads 0.
//   d_bPtr     - array that receives tid at position tid (for 0 < tid < count)
//   count      - upper bound on the indices this kernel touches
//   d_bStopPtr - flag array; a flag is set to 0 only if the flag before it reads 0
// Thread 0 only clears d_bStopPtr[0] and returns.
// NOTE(review): every other thread stays in the while loop until
// d_bStopPtr[count-1] reads 0, and that value is written by a different thread
// (possibly in a different block) of the same launch. Nothing in this code
// guarantees that the writing thread gets to run while the others are looping.
__global__ 
void dosomething(int *d_bPtr, int count, int* d_bStopPtr)
{
// flat 1D global index of this thread
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid==0)
    d_bStopPtr[tid]=0;
else if(tid<count)
    {
// if the last cell of the array is still not 0 yet, repeat
        while(d_bStopPtr[count-1])
        {
            d_bPtr[tid]=tid;
// only if the array cell before it is 0, then change it to 0 too
            if (d_bStopPtr[tid-1]==0 )
                d_bStopPtr[tid]=0;
        }
    }
}

int main()
{
    // Number of elements in every vector below.
    int count = 100000;

    // One host buffer and two device arrays: values (all 0) and stop flags (all 1).
    thrust::host_vector<int> h_a(count);
    thrust::device_vector<int> d_b(count, 0);
    thrust::device_vector<int> d_bStop(count, 1);
    int* d_bPtr = thrust::raw_pointer_cast(&d_b[0]);
    int* d_bStopPtr = thrust::raw_pointer_cast(&d_bStop[0]);

    // Derive the launch configuration from the properties of device 0.
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int threadsPerBlock = prop.maxThreadsDim[0];
    int blocksNeeded = (count + threadsPerBlock - 1) / threadsPerBlock;
    int blocksPerGrid = min(prop.maxGridSize[0], blocksNeeded);

    // Print three sample values before the kernel has run.
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;

    // Single launch; the kernel itself loops until the last stop flag is 0.
    dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr, count, d_bStopPtr);

    // Print the same three sample values after the launch.
    thrust::copy(d_b.begin(), d_b.end(), h_a.begin());
    cout << h_a[100] << "\t" << h_a[200] << "\t" << h_a[300] << "\t" << endl;

    // Block on console input so the output stays visible.
    int x;
    cin >> x;
    return 0;
}

但是，第二个版本总是导致图形卡和计算机挂起。你可以帮助我加快第一个版本的速度吗？如何检查内核中的条件然后跳出并停止内核？

Answer 1

您实际上是在寻求全局线程同步，而这在GPU编程中是大忌。理想情况下，每个线程块都是独立的，可以仅凭自己的数据和处理完成工作。如果某个线程块要依赖其他线程块的结果才能完成工作，就可能产生死锁。假设我有一个带有14个SM（线程块执行单元）的GPU，并且我创建了100个线程块。现在假设线程块0-13正在等待线程块99释放一个锁（例如，向某个特定位置写入零值）。再假设这前14个线程块开始在14个SM上执行，并且在循环中对这个锁变量自旋等待。一旦线程块0-13占满了所有SM，GPU中就没有任何机制能保证线程块99会先执行，甚至不能保证它会被执行。

我们暂且不讨论“全局内存（GMEM）访问停顿会不会迫使线程块0-13被换出”这类问题，因为这些都不能保证线程块99会在某个时刻优先得到执行。唯一能保证线程块99得到执行的，是其他线程块执行完毕（即退出并释放SM）。但如果其他线程块正在自旋等待线程块99的结果，这种情况可能永远不会发生。

具有良好前向兼容性和可扩展性的GPU代码依赖于相互独立的并行工作。因此，建议您重新设计算法，使各部分工作能够独立完成，至少在线程块之间应当相互独立。

如果你必须进行全局线程同步，那么内核启动（的边界）是唯一真正有保证的全局同步点，因此你的第一种方法是可行的方法。

为了帮助解决这个问题，研究归约（reduction）算法在GPU上的实现方式可能会有帮助。各种类型的归约都存在跨所有线程的依赖关系，但通过生成中间结果，我们可以把工作分解为相互独立的部分。然后可以使用多次内核启动的方法（或其他更高级的方法）把这些独立部分的结果汇总起来，从而相对于等价的串行算法获得加速。

你的内核实际上做的事情并不多。它把一个数组的每个元素设置为其索引，即 a[i] = i;，并把另一个数组全部设置为零（尽管是按顺序依次设置的），即 b[i] = 0;。下面是一个给第一个版本的代码“加速”的示例，您可以这样做：

    // include the header files
#include <iostream>
#include <stdio.h>
#include <time.h>

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

using namespace std;

// Kernel: writes d_bPtr[i] = i and clears d_bStopPtr[i] to 0 for every i < count.
// A flag is cleared only after the flag before it already reads 0 (flag 0 is
// cleared unconditionally), so the zeros propagate in index order.
//
// Launch layout: intended for a single block (the host code uses <<<1, 32>>>).
// Each thread starts at its global index and advances by blockDim.x, so one block
// walks the whole array and the wait on the previous flag never depends on a
// thread from another block.
//   d_bPtr     - output array, at least count ints
//   count      - number of elements to process
//   d_bStopPtr - flag array, at least count ints
__global__
void dosomething(int *d_bPtr, int count, int* d_bStopPtr)
{
    // The flags are polled while other threads write them. Reading through a
    // volatile pointer forces a real memory access on every poll; without it the
    // compiler may keep a flag in a register and the loop would never see the update.
    volatile int* stopFlags = d_bStopPtr;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while(tid<count)
    {
        d_bPtr[tid]=tid;
        // poll until this thread's own flag has been cleared
        while(stopFlags[tid]!=0)
        {
            // only if the array cell before it is 0, then change it to 0 too
            if (tid==0)
            {
                stopFlags[tid]=0;
            }
            else if (stopFlags[tid-1]==0)
            {
                stopFlags[tid]=0;
            }
        }
        // advance by one block width
        tid += blockDim.x;
    }
}

// Host driver: allocates the value and flag arrays on the device, runs the kernel
// once with a single block of 32 threads, and prints three sample values before
// and after. Returns 0 on success and 1 if the kernel launch or execution failed.
int main()
{
    int count=100000;
// define the vectors
    thrust::host_vector <int> h_a(count);
    thrust::device_vector <int> d_b(count,0);
    int* d_bPtr=thrust::raw_pointer_cast(&d_b[0]);
    thrust::device_vector <int> d_bStop(count,1);
    int* d_bStopPtr=thrust::raw_pointer_cast(&d_bStop[0]);
// get the device property (only used by the commented-out launch configuration below)
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

//    int threadsPerBlock = prop.maxThreadsDim[0];
    int threadsPerBlock = 32;
//    int blocksPerGrid = min(prop.maxGridSize[0], (count + threadsPerBlock - 1) / threadsPerBlock);
    int blocksPerGrid = 1;
//copy device to host
    thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
    cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
//run the kernel
//    while(d_bStop[count-1])
//    {
    dosomething<<<blocksPerGrid, threadsPerBlock>>>(d_bPtr,count,d_bStopPtr);
//    }
// a launch does not return an error itself: configuration errors are reported by
// cudaGetLastError(), execution errors by the next synchronizing call
    int status = 0;
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        cerr<<"dosomething kernel failed: "<<cudaGetErrorString(err)<<endl;
        status = 1;
    }
    else
    {
//copy device back to host again
        thrust::copy(d_b.begin(),d_b.end(),h_a.begin());
        cout<<h_a[100]<<"\t"<<h_a[200]<<"\t"<<h_a[300]<<"\t"<<endl;
    }
//wait to see the console output
    int x;
    cin>>x;
    return status;
}

在我的机器上，这使执行时间从约10秒缩短到几乎瞬间完成（远小于1秒）。请注意，这并不是CUDA编程的好示例，因为我只启动了一个包含32个线程的线程块，这远不足以有效利用整个GPU。但是你的内核所做的工作实在太简单，我也不确定怎样才算一个好的示例。我也可以写一个内核，把一个数组设置为其索引（a[i] = i;），把另一个数组设置为零（b[i] = 0;），并且全部并行完成。那样会更快，而且可以利用整台机器。

检查CUDA内核中的device_vector不起作用

1 个答案: