我有一段代码,其中有一部分必须严格串行执行(临界区)。我为这段代码使用了一个锁,使内核中的每个线程(每个块只启动一个线程)以原子方式执行该段代码。困扰我的是线程的执行顺序——我需要线程按照它们的索引(实际上是按照它们的 blockIdx)依次执行,从 0 到 9(而不是随机顺序,例如 5、8、3、0……)。这有可能实现吗?
以下是一个示例代码:
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<math_functions.h>
#include<time.h>
#include<cuda.h>
#include<cuda_runtime.h>
// Number of thread blocks in the kernel launch; main() uses it as the grid
// size and launches one thread per block.
#define nob 10
// Device-side spin lock backed by a single int in device memory.
//
// The lock word is 0 when free and 1 when held. lock() spins until it
// atomically swaps 0 -> 1; unlock() stores 0 again. The lock gives mutual
// exclusion only: it does not impose any order on the threads that enter.
//
// Ownership: only the object created by the default constructor owns the
// device allocation. Copies (including the by-value copy made when a Lock is
// passed as a kernel argument) are non-owning aliases, so their destructors
// do not free the lock word while a kernel may still be using it, and the
// allocation is released exactly once. Assigning one Lock to another is not
// supported (the implicit assignment would copy the ownership flag).
//
// NOTE(review): intended for at most one contending thread per warp (the
// kernel in this file uses one thread per block); several threads of the same
// warp spinning on this lock can deadlock on some architectures.
struct Lock{
    int *mutex;   // device pointer to the lock word (0 = free, 1 = held)
    bool owner;   // true only for the object that allocated `mutex`

    // Allocates the lock word on the device and initialises it to 0 (free).
    // Aborts with a diagnostic if the allocation or the copy fails, because a
    // Lock without a valid device pointer would fault inside the kernel.
    Lock(void) : mutex(NULL), owner(true){
        int state = 0;
        cudaError_t err = cudaMalloc((void**) &mutex, sizeof(int));
        if(err == cudaSuccess)
            err = cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
        if(err != cudaSuccess){
            fprintf(stderr, "Lock: CUDA error: %s\n", cudaGetErrorString(err));
            abort();
        }
    }

    // Copying produces a non-owning alias of the same device lock word.
    __host__ __device__ Lock(const Lock &other) : mutex(other.mutex), owner(false){
    }

    // Frees the device lock word, but only from the owning object.
    ~Lock(void){
        if(owner && mutex != NULL)
            cudaFree(mutex);
    }

    // Spins until this thread changes the lock word from 0 to 1.
    __device__ void lock(void){
        while(atomicCAS(mutex, 0, 1) != 0){
            // busy-wait until the current holder releases the lock
        }
        __threadfence();   // do not let critical-section accesses move above the acquire
    }

    // Releases the lock by storing 0 into the lock word.
    __device__ void unlock(void){
        __threadfence();   // make critical-section writes visible before the release
        atomicExch(mutex, 0);
    }
};
// Kernel launched with one thread per block: each block's single thread takes
// the shared lock, prints its block index, and releases the lock again.
// The lock serialises the printf calls; the goal stated for this code is that
// block 0 enters first, then block 1, and so on.
__global__ void theKernel(Lock myLock){
    // Only one thread exists per block, so the block index identifies it.
    const int blockId = blockIdx.x;

    // ... work that may run in parallel across blocks goes here ...

    // ---- critical section: one block at a time ----
    myLock.lock();
    printf("Thread with index=%i inside critical section now...\n", blockId);
    myLock.unlock();
}
// Creates the device lock, launches theKernel with `nob` blocks of one thread
// each, and waits for it to finish.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main(void)
{
    Lock myLock;
    theKernel<<<nob, 1>>>(myLock);

    // A kernel launch reports configuration errors only through
    // cudaGetLastError(); execution errors surface at the next synchronisation.
    cudaError_t err = cudaGetLastError();
    if(err == cudaSuccess){
        // Wait for the kernel to complete. Synchronising also flushes the
        // device-side printf buffer, which is not flushed automatically when
        // the program exits.
        err = cudaDeviceSynchronize();
    }
    if(err != cudaSuccess){
        fprintf(stderr, "theKernel failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
给出了以下结果:
Thread with index=1 inside critical section now...
Thread with index=0 inside critical section now...
Thread with index=5 inside critical section now...
Thread with index=9 inside critical section now...
Thread with index=7 inside critical section now...
Thread with index=6 inside critical section now...
Thread with index=3 inside critical section now...
Thread with index=2 inside critical section now...
Thread with index=8 inside critical section now...
Thread with index=4 inside critical section now...
我希望这些索引从0开始并按时间顺序执行到9。
我认为修改Lock以实现此目的的一种方法如下:
// Device-side "take turns" lock: threads enter in increasing index order.
//
// `indexAllow` points to one int in device memory holding the index that is
// currently allowed to enter. It starts at 0; lock(index) waits until the
// counter equals `index`, and unlock() increments it so the next index can go.
//
// Ownership: only the object created by the default constructor owns the
// device allocation. Copies (including the by-value copy made when a Lock is
// passed as a kernel argument) are non-owning aliases, so the counter is freed
// exactly once and never while a copy's destructor runs. Assigning one Lock to
// another is not supported (the implicit assignment would copy the ownership
// flag).
//
// NOTE(review): every waiting thread spins, so all indices up to the highest
// waiting one must be able to run concurrently; presumably fine for a small
// grid with one thread per block — confirm for larger launches.
struct Lock{
    int *indexAllow;   // device pointer to the index currently allowed to enter
    bool owner;        // true only for the object that allocated `indexAllow`

    // Allocates the turn counter on the device and initialises it to 0.
    // Aborts with a diagnostic if the allocation or the copy fails.
    Lock(void) : indexAllow(NULL), owner(true){
        int startVal = 0;
        cudaError_t err = cudaMalloc((void**) &indexAllow, sizeof(int));
        if(err == cudaSuccess)
            err = cudaMemcpy(indexAllow, &startVal, sizeof(int), cudaMemcpyHostToDevice);
        if(err != cudaSuccess){
            fprintf(stderr, "Lock: CUDA error: %s\n", cudaGetErrorString(err));
            abort();
        }
    }

    // Copying produces a non-owning alias of the same device counter.
    __host__ __device__ Lock(const Lock &other) : indexAllow(other.indexAllow), owner(false){
    }

    // Frees the device counter, but only from the owning object.
    ~Lock(void){
        if(owner && indexAllow != NULL)
            cudaFree(indexAllow);
    }

    // Waits until the turn counter equals `index`.
    __device__ void lock(int index){
        // The counter is changed by other threads, so it must be re-read from
        // memory on every iteration. Through a plain pointer the compiler may
        // load it once and spin forever on the stale value; the volatile
        // access forces a fresh load each time.
        volatile int *turn = indexAllow;
        while(*turn != index){
            // busy-wait for our turn
        }
        __threadfence();   // do not let critical-section accesses move above the acquire
    }

    // Passes the turn to the next index.
    __device__ void unlock(void){
        __threadfence();   // make critical-section writes visible before the hand-over
        atomicAdd(indexAllow, 1);
    }
};
然后在获取锁时把索引作为参数传入:
myLock.lock(index);
但这会让我的电脑卡死……我可能漏掉了某些显而易见的东西。
如果有人可以提供帮助,我会很感激!
感谢!!!
答案 0 :(得分:2)
我改变了你的代码。现在它产生你想要的输出:
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<math_functions.h>
#include<time.h>
#include<cuda.h>
#include<cuda_runtime.h>
// Number of thread blocks in the kernel launch; main() uses it as the grid
// size and launches one thread per block.
#define nob 10
// Device-side ordered lock: the lock word holds the index that may enter next.
//
// The word starts at 0. lock(compare) spins until it atomically replaces the
// value `compare` with the "held" marker 0xFFFFFFFF (stored as -1 in the int
// lock word); unlock(val) stores val + 1, which lets the thread whose index is
// val + 1 acquire the lock. The marker must never be a valid index.
//
// Ownership: only the object created by the default constructor owns the
// device allocation. Copies (including the by-value copy made when a Lock is
// passed as a kernel argument) are non-owning aliases, so the lock word is
// freed exactly once and never by a copy's destructor. Assigning one Lock to
// another is not supported (the implicit assignment would copy the ownership
// flag).
struct Lock{
    int *mutex;   // device pointer to the lock word (next allowed index, or -1 while held)
    bool owner;   // true only for the object that allocated `mutex`

    // Allocates the lock word on the device and initialises it to 0, so index
    // 0 is the first one allowed in. Aborts with a diagnostic on failure.
    Lock(void) : mutex(NULL), owner(true){
        int state = 0;
        cudaError_t err = cudaMalloc((void**) &mutex, sizeof(int));
        if(err == cudaSuccess)
            err = cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
        if(err != cudaSuccess){
            fprintf(stderr, "Lock: CUDA error: %s\n", cudaGetErrorString(err));
            abort();
        }
    }

    // Copying produces a non-owning alias of the same device lock word.
    __host__ __device__ Lock(const Lock &other) : mutex(other.mutex), owner(false){
    }

    // Frees the device lock word, but only from the owning object.
    ~Lock(void){
        if(owner && mutex != NULL)
            cudaFree(mutex);
    }

    // Spins until the lock word equals `compare`, then marks the lock as held.
    // `unsigned int` replaces the non-standard `uint` typedef (same type where
    // that typedef exists).
    __device__ void lock(unsigned int compare){
        // The lock word is an int, so the int overload of atomicCAS is used;
        // the conversions are written out instead of being left implicit.
        const int expected = (int) compare;
        const int held = -1;   // bit pattern 0xFFFFFFFF: no block index is this large
        while(atomicCAS(mutex, expected, held) != expected){
            // busy-wait for our turn
        }
        __threadfence();   // do not let critical-section accesses move above the acquire
    }

    // Releases the lock and hands it to index val + 1.
    __device__ void unlock(unsigned int val){
        __threadfence();   // make critical-section writes visible before the hand-over
        atomicExch(mutex, (int) (val + 1u));
    }
};
// Kernel launched with one thread per block: each block's single thread waits
// for its turn on the lock (passing its block index), prints that index, and
// then releases the lock, passing the same index.
__global__ void theKernel(Lock myLock){
    // Only one thread exists per block, so the block index identifies it.
    const int blockId = blockIdx.x;

    // ... work that may run in parallel across blocks goes here ...

    // ---- critical section: block 0 first, then block 1, and so on ----
    myLock.lock(blockId);
    printf("Thread with index=%i inside critical section now...\n", blockId);
    // Memory fence issued before the lock is released.
    // NOTE(review): it is not established that this fence controls the order
    // in which device printf output appears — confirm.
    __threadfence_system();
    myLock.unlock(blockId);
}
// Creates the device lock, launches theKernel with `nob` blocks of one thread
// each, and waits for it to finish.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main(void)
{
    Lock myLock;
    theKernel<<<nob, 1>>>(myLock);

    // A kernel launch reports configuration errors only through
    // cudaGetLastError(); execution errors surface at the next synchronisation.
    cudaError_t err = cudaGetLastError();
    if(err == cudaSuccess){
        // Wait for the kernel to complete. Synchronising also flushes the
        // device-side printf buffer, which is not flushed automatically when
        // the program exits.
        err = cudaDeviceSynchronize();
    }
    if(err != cudaSuccess){
        fprintf(stderr, "theKernel failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
`lock()` 函数接受 `compare` 作为参数,并检查它是否等于 `mutex` 中已有的值。如果相等,就把 `0xFFFFFFFF` 写入 `mutex`,表示该线程已经获得了锁。因为 `mutex` 在构造函数中被初始化为 0,所以只有块索引为 0 的线程能够首先成功获取锁。在 `unlock()` 中,我们把下一个块索引写入 `mutex`,从而保证你所需要的执行顺序。另外,因为你在 CUDA 内核中使用了 `printf()`,所以我认为需要调用 `__threadfence_system()`,才能让输出以相同的顺序显示。