Question

假设我有8个块，每个块有32个线程，在GTX 970上运行。每个块（block）要么将全1、要么将全0写入全局内存中一个长度为32的数组，其中块内的线程0写入数组的位置0。

现在，实际写入这些值时使用atomicExch，将数组中的当前值与该块要写入的值进行交换。由于SIMD、原子操作以及warp以锁步方式执行这一事实，我预期数组在任何时间点都只包含1或只包含0，而绝不会是两者的混合。

然而，在运行这样的代码时，有几种情况下，在某个时间点，数组包含0和1的混合。这似乎表明原子操作不是每个warp执行的，而是使用其他方案安排的事实。

在其他资料中，我还没有找到一篇结论性的文章来详细说明不同warp之间原子操作的调度方式（如果我错了，请纠正我），所以我想知道是否有关于这个主题的信息。因为我需要将多个由几个32位整数组成的小向量原子地写入全局内存，所以一个能保证以原子方式写入单个向量的原子操作显然非常重要。

对于那些想知道的人，我写的代码是在GTX 970上执行的，使用CUDA 8.0在计算能力5.2上编译。

Answer 1

原子指令与所有指令一样，按每个warp进行调度。然而，存在与原子相关联的未指定的管道，并且对于通过管道的每个阶段，不保证对于每个线程以锁步方式执行通过管道的调度指令流。这使您的观察成为可能。

我相信一个简单的思想实验就能证明这必然成立：如果同一个warp中的2个线程针对同一个位置，会怎么样？显然，处理过程的每个环节不可能都以锁步方式进行。作为附加示例，我们可以把这个思想实验扩展到SM内部、甚至跨SM每个时钟周期发射多条指令的情况。

如果向量长度足够短（16个字节或更少），则应该可以通过让warp中的一个线程写入一个适当的向量类型（例如int4）的量来实现这一点（“原子更新”）。只要所有线程（无论它们在网格中的哪个位置）都尝试更新自然对齐的位置，写入就不会被其他写入破坏。

然而，在评论中讨论之后，OP的目标似乎是能够使warp或threadblock更新一些长度的向量，而不受其他warp或threadblock的干扰。在我看来，真正需要的是访问控制（因此一次只有一个warp或threadblock更新特定的向量），OP有一些代码没有按照需要工作。

可以使用普通的原子操作（在下面的示例中为atomicCAS）来强制实施这种访问控制，使得一次只允许一个“生产者”更新一个向量。

以下是一个生产者-消费者示例代码，其中有多个线程块在更新一系列向量。每个向量“槽”（slot）都有一个“槽控制”（slot control）变量，它以原子方式更新，用于表示：

向量为空
向量正在被填充
向量已填充，可供“消费”

#include <assert.h>
#include <iostream>
#include <stdio.h>

// ---- Problem dimensions -----------------------------------------------------
const int num_slots   = 256;    // number of vector slots shared between producers and consumer
const int slot_length = 32;     // ints per slot; main() launches this many threads per block
const int max_act     = 65536;  // consumer stops after this many full slots; producers stop once num_act reaches it
// needs to be greater than the maximum number of SMs for any GPU that it will be run on
const int max_sm      = 64;

// ---- Slot life-cycle states stored in slot_control[] ------------------------
const int slot_empty   = 0;     // free: a producer may claim it
const int slot_filling = 1;     // claimed by a producer, payload being written
const int slot_full    = 2;     // payload complete, waiting for the consumer

// ---- Device-side state ------------------------------------------------------
__device__ int slots[num_slots*slot_length];    // slot payloads, slot_length ints each
__device__ int slot_control[num_slots] = {0};   // one state word per slot, starts as slot_empty
__device__ volatile int restricted_sm = -1;     // SM id of the consumer; -1 until the consumer publishes it
__device__ int block_id = 0;                    // ticket counter bumped with atomicAdd in kernel()
__device__ int num_act = 0;                     // number of slot claims made by producers so far
__device__ int actives[max_sm] = {0};           // reported by producers (indexed by SM id)
__device__ int observations[max_sm] = {0};      // reported by consumer (indexed by the value read from a slot)
__device__ int correct = 0;                     // number of slots the consumer verified

// Returns the value of the PTX special register %smid for the calling thread,
// i.e. the identifier of the SM the thread is currently executing on.
static __device__ __inline__ int __mysmid(){
  int sm_index;
  asm volatile("mov.u32 %0, %%smid;" : "=r"(sm_index));
  return sm_index;
}


// Producer/consumer demonstration of block-exclusive vector updates.
//
// Launch layout: 1-D grid of 1-D blocks (main() uses 256 blocks of slot_length
// threads). No dynamic shared memory.
//
// The block that draws ticket 0 from block_id becomes the consumer: its thread 0
// publishes its SM id in restricted_sm, then polls slot_control[] and, for each
// slot marked slot_full, checks that all slot_length entries agree, counts the
// observation and marks the slot slot_empty again. It stops after max_act slots.
//
// Every other block is a producer, unless it runs on the consumer's SM, in which
// case it exits immediately. A producer repeatedly claims an empty slot with
// atomicCAS (slot_empty -> slot_filling), fills it with its SM id, and publishes
// it as slot_full. Producers stop once num_act reaches max_act.
//
// this code won't work on a GPU with a single SM!
// NOTE(review): producers spin until the consumer has published restricted_sm, so
// the consumer block presumably has to be resident together with them - confirm
// for the target GPU / grid size.
__global__ void kernel(){

  __shared__ volatile int done, update, next_slot;
  __shared__ int block_ticket;

  // Draw exactly one ticket per block. Having every thread call atomicAdd would
  // give each thread a different value, and nothing guarantees that the thread
  // receiving 0 is threadIdx.x == 0 - in which case no consumer would exist.
  if (!threadIdx.x) block_ticket = atomicAdd(&block_id, 1);
  __syncthreads();
  const int my_block_id = block_ticket;
  const int my_sm = __mysmid();

  if (my_block_id == 0){
    if (!threadIdx.x){
      restricted_sm = my_sm;
      __threadfence();
      // I am "block 0" and process the vectors, checking for coherency
      // "consumer"
      volatile int *vslot_control = slot_control;
      volatile int *vslots = slots;
      int cursor = 0;   // slot currently being polled
      int scount = 0;   // number of full slots consumed so far
      while(scount < max_act){
        if (vslot_control[cursor] == slot_full){
          // order the payload reads after the flag read (pairs with the
          // producer's fence between payload write and flag write)
          __threadfence();
          scount++;
          int slot_val = vslots[cursor*slot_length];
          for (int i = 1; i < slot_length; i++) if (slot_val != vslots[cursor*slot_length+i]) { assert(0); /* badness - incoherence */}
          observations[slot_val]++;
          vslot_control[cursor] = slot_empty;   // hand the slot back to the producers
          correct++;
          __threadfence();
          }
        cursor++;
        if (cursor >= num_slots) cursor = 0;
        }
      }}
  else {
    // "producer"
    while (restricted_sm < 0);  // wait for signaling
    // all threads of a block share one SM, so this exit is block-uniform
    if (my_sm == restricted_sm) return;
    if (!threadIdx.x){
      next_slot = 0;
      done = 0;
      update = 0;}
    __syncthreads();
    while (true) {
      if (!threadIdx.x){
        while (atomicCAS(slot_control+next_slot, slot_empty,  slot_filling) > slot_empty) {
          next_slot++;
          if (next_slot >= num_slots) next_slot = 0;}
        // we grabbed an empty slot, fill it with my_sm
        if (atomicAdd(&num_act, 1) < max_act)   update = 1;
        else {
          // quota exhausted: give the slot back instead of leaving it stuck in slot_filling
          atomicExch(slot_control+next_slot, slot_empty);
          done = 1; update = 0;}
        }
      __syncthreads();

      // Snapshot thread 0's decision. Thread 0 does not touch these shared
      // variables again until after the next barrier, so every thread sees the
      // same values and control flow stays block-uniform.
      const int my_update = update;
      const int my_done   = done;
      const int my_slot   = next_slot;

      if (my_update)
        // stride loop: correct for any blockDim.x, not only blockDim.x == slot_length
        for (int i = threadIdx.x; i < slot_length; i += blockDim.x)
          slots[my_slot*slot_length+i] = my_sm;
      __threadfence(); //enforce ordering
      // __threadfence() only orders the calling thread's own writes; the barrier
      // makes sure every thread's payload store (and fence) has happened before
      // thread 0 publishes the slot.
      __syncthreads();
      if ((my_update) && (!threadIdx.x)){
        atomicExch(slot_control+my_slot, slot_full); // mark slot full
        atomicAdd(actives+my_sm, 1);}
      if (my_done) break;
    }
  }
}

// Host driver: launches kernel() with 256 blocks of slot_length threads, waits
// for completion, then copies back and prints the per-SM counters.
//
// Output: one "actives,observations" pair per SM index, followed by a line with
// the totals and the number of slots the consumer verified.
// Returns 0 on success, 1 if the kernel or any copy-back failed.
int main(){

  kernel<<<256, slot_length>>>();
  cudaError_t res = cudaGetLastError();                    // launch-configuration errors
  if (res == cudaSuccess) res = cudaDeviceSynchronize();   // in-kernel faults (e.g. device assert)
  if (res != cudaSuccess) {
    printf("kernel failure: %d (%s)\n", (int)res, cudaGetErrorString(res));
    return 1;   // the counters are meaningless after a failed run
  }

  // max_sm is a compile-time constant, so plain arrays avoid the new[]/delete[] pair
  int h_obs[max_sm];
  int h_act[max_sm];
  int h_correct = 0;
  res = cudaMemcpyFromSymbol(h_obs, observations, sizeof(int)*max_sm);
  if (res == cudaSuccess) res = cudaMemcpyFromSymbol(h_act, actives, sizeof(int)*max_sm);
  if (res == cudaSuccess) res = cudaMemcpyFromSymbol(&h_correct, correct, sizeof(int));
  if (res != cudaSuccess) {
    printf("copy failure: %d (%s)\n", (int)res, cudaGetErrorString(res));
    return 1;
  }

  int h_total_act = 0;
  int h_total_obs = 0;
  for (int i = 0; i < max_sm; i++){
    std::cout << h_act[i] << "," << h_obs[i] << " ";
    h_total_act += h_act[i];
    h_total_obs += h_obs[i];}
  std::cout << std::endl << h_total_act << "," << h_total_obs << "," << h_correct << std::endl;
  return 0;
}

我不会声称此代码对任何用例都没有缺陷。提出它是为了证明概念的可行性，而不是作为可直接用于生产的代码。在Linux上，在我测试过的几个不同系统上，它似乎都能正常工作。它不应该在只有一个SM的GPU上运行，因为一个SM是为消费者保留的，其余的SM由生产者使用。

CUDA中的原子操作是否保证按warp进行调度？

1 个答案: