Question

你好我是cuda编程的初学者。我使用lock.lock（）函数等待以前的线程完成工作。这是我的代码：

#include "book.h"
#include <cuda.h>
#include <conio.h>
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include <string>
#include <curand.h>
#include <curand_kernel.h>
#include "lock.h"
#define pop 10
#define gen 10
#define pg pop*gen
using namespace std;
__global__ void hold(Lock lock,float* a )
{
    __shared__ int cache[gen];
int tid=blockIdx.x * blockDim.x+threadIdx.x;
int cacheIndex = threadIdx.x;
if(tid<gen)
{
    a[tid]=7;//this number example but in my chase this random number
}
else
{
    //cache[cacheIndex]=a[tid];
    int temp;
        if(tid%gen==0)
        {

            a[tid]=tid+4;//this example number but in my chase this random number if tid==tid%gen
            temp=a[tid];
            tid+=blockIdx.x*gridDim.x;

        }
        else
        {
            __syncthreads();
            a[tid]=temp+1;//this must a[tid]=a[tid-1]+1;
            temp=a[tid];
            tid+=blockIdx.x*gridDim.x;

        }

    cache[cacheIndex]=temp;
    __syncthreads();
    for (int i=0;i<gen;i++)
    {
        if(cacheIndex==i)
        {
            lock. lock();
            cache[cacheIndex]=temp;
            lock.unlock();
        }
    }


}

}
int main()
{
float time;
float* a=new float [pg];
float *dev_a;

HANDLE_ERROR( cudaMalloc( (void**)&dev_a,pg *sizeof(int) ) );
Lock lock;
cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate(&start) );
HANDLE_ERROR( cudaEventCreate(&stop) );
HANDLE_ERROR( cudaEventRecord(start, 0) );
hold<<<pop,gen>>>(lock,dev_a);
HANDLE_ERROR( cudaMemcpy( a, dev_a,pg * sizeof(float),cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaEventRecord(stop, 0) );
HANDLE_ERROR( cudaEventSynchronize(stop) );
HANDLE_ERROR( cudaEventElapsedTime(&time, start, stop) );
for(int i=0;i<pop;i++)
{
    for(int j=0;j<gen;j++)
    {
        cout<<a[(i*gen)+j]<<" ";
    }
    cout<<endl;
}
printf("hold:  %3.1f ms \n", time);
HANDLE_ERROR(cudaFree(dev_a));
HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );
system("pause");
return 0;
}

这就是结果：

7 7 7 7 7 7 7 7 7 7

14 0 0 0 0 0 0 0 0 0

24 0 0 0 0 0 0 0 0 0

34 0 0 0 0 0 0 0 0 0

44 0 0 0 0 0 0 0 0 0

54 0 0 0 0 0 0 0 0 0

64 0 0 0 0 0 0 0 0 0

74 0 0 0 0 0 0 0 0 0

84 0 0 0 0 0 0 0 0 0

94 0 0 0 0 0 0 0 0 0

我的预期结果：

7 7 7 7 7 7 7 7 7 7

14 15 16 17 18 19 20 21 22 23

24 25 26 27 28 29 23 31 32 33

34 35 36 37 38 39 40 41 42 43

44 45 46 47 48 49 50 51 52 53

54 55 56 57 58 59 60 61 62 63

64 65 66 67 68 69 70 71 72 73

74 75 76 77 78 79 80 81 82 83

84 85 86 87 88 89 90 91 92 93

94 95 96 97 98 99 100 101 102 103

任何人请帮我纠正我的代码。谢谢

Answer 1

如果您需要帮助，请注意您的部分代码（例如lock.h和book.h）来自CUDA示例书，这将非常有用。这不是CUDA的标准部分，因此如果您没有指出CUDA的来源，可能会造成混淆。

我在您的代码中看到以下问题：

您在条件块中使用__syncthreads()，并非所有线程都符合__syncthreads()障碍：
```
if(tid%gen==0)
{
  ...
}
else
{
    __syncthreads();  // illegal

}
```
以这种方式使用__syncthreads() is illegal，因为并非所有线程都能够达到__syncthreads()障碍：

__ syncthreads（）在条件代码中是允许的，但仅当条件在整个线程块中进行相同的求值时，否则代码执行可能会挂起或产生意外的副作用。

您正在使用temp局部变量而不首先初始化它：
```
    a[tid]=temp+1;//this must a[tid]=a[tid-1]+1;
```
请注意temp是线程本地变量。它不是在线程之间共享的。因此，上面的代码行（对于else块中的线程）使用了temp的单位化值。
内核代码的其余部分：
```
    cache[cacheIndex]=temp;
    __syncthreads();
    for (int i=0;i<gen;i++)
    {
      if(cacheIndex==i)
      {
        lock. lock();
        cache[cacheIndex]=temp;
        lock.unlock();
      }
    }


}
```
没有任何用处因为它正在更新共享内存位置（即cache），它们永远不会转移回dev_a变量，即全局内存。因此，这些代码都不会影响您打印的结果。

很难按照您在代码中尝试完成的操作。但是，如果您更改此行（未初始化的值）：

    int temp;

到此：

    int temp=tid+3;

您的代码将根据您显示的内容打印出数据。

线程工作如果先前线程完成工作（cuda）在同一块中

1 个答案: