Question

我有N个迭代和L个内存位置。每次迭代都写入0到（L-1）个存储单元中的任何一个。

我想根据迭代次数将数据存储在全局内存中。假设迭代K和K-1都写入相同的位置。全局内存中的最终结果应为K.线程K-1不应覆盖线程K的结果。

我尝试过以下解决方案。我为每个内存位置创建一个具有最大迭代次数的数组，并初始化为-1。然后我检查迭代次数是否大于当前值。如果是，我存储值并更新最大迭代次数。

这是我当前的代码，但它没有为更大的迭代提供正确的结果。

#include<stdio.h>
#include"cuda.h"
__global__ void  fun(int *A,int *maxIndex,int *index1,int *lock)
{
                    int threadid=blockIdx.x*blockDim.x+threadIdx.x;
                    int iteration_no=threadid;
                    int index=index1[threadid];
                    int exitFromLoop=1;
                    while(exitFromLoop==1)
                   {
                            int flag=atomicCAS(&lock[index],0,1);
                     if(flag==0)


                           {
                                    if(maxIndex[index]<iteration_no)
                                    {       
                                            A[index]=89;
                                            maxIndex[index]=iteration_no;
                                             __threadfence();
                                    }
                                    else
                                    {
                                    }
                                    //__threadfence();
                                    lock[index]=0;
                                    exitFromLoop=0;
                            }
                            else
                            {

                            }
                    }

}

int main()
{
    int A[10]={10,20,30,40,50,60,70,80,90,100};
    int maxIndex[10]={-1,-1,-1,-1,-1,-1,-1,-1,-1,-1};
    int lock[10]={0,0,0,0,0,0,0,0,0,0};
    int index[8192];
    srand(0);
    for(int ii=0;ii<8192;ii++)
    {
            index[ii]=rand()%10;
    }
    int *index1;
    int *A1,*maxIndex1;
    int *lock1;
     cudaMalloc((void**)&lock1,sizeof(int)*10);
    cudaMalloc((void**)&A1,sizeof(int)*10);
    cudaMalloc((void**)&index1,sizeof(int)*8192);
    cudaMalloc((void**)&maxIndex1,sizeof(int)*10);
    cudaMemcpy(A1,&A,sizeof(int)*10,cudaMemcpyHostToDevice);
    cudaMemcpy(lock1,&lock,sizeof(int)*10,cudaMemcpyHostToDevice);
    cudaMemcpy(maxIndex1,&maxIndex,sizeof(int)*10,cudaMemcpyHostToDevice);
    cudaMemcpy(index1,&index,sizeof(int)*8192,cudaMemcpyHostToDevice);
    fun<<<16,512>>>(A1,maxIndex1,index1,lock1);
    cudaMemcpy(&A,A1,sizeof(int)*10,cudaMemcpyDeviceToHost);
    cudaMemcpy(&maxIndex,maxIndex1,sizeof(int)*10,cudaMemcpyDeviceToHost);
    printf("\nindex \n");
    for(int i=0;i<8192;i++)
    {
            printf("%d\n",index[i]);
    }

    for(int i=0;i<10;i++)
    {
            printf(" %d max is %d\n",A[i],maxIndex[i]);
    }
}

Answer 1

我认为这可能就是你所追求的目标。

对于A数组中的每个元素，maxIndex数组中有一个对应的元素，其中包含更新iteration_no中元素的最后一个线程的A阵列。如果当前线程的iteration_no高于maxIndex，则iteration_no将更新为当前线程的A，并且线程会更新A中的元素。

如果当前线程的iteration_no较低，则iteration_no元素不会更新，maxIndex中的#include<stdio.h> #include"cuda.h" __global__ void fun(int *A,int *maxIndex,int *index) { int iteration_no=blockIdx.x*blockDim.x+threadIdx.x; int i=index[iteration_no]; if (atomicMax(maxIndex + i, iteration_no) < iteration_no) { A[i] = 89; } } int main() { int A[10] = {10,20,30,40,50,60,70,80,90,100}; int maxIndex[10]={-1}; int index[8192]; srand(0); for(int ii=0;ii<8192;ii++) { index[ii]=rand()%10; } int *index_d; int *A_d,*maxIndex_d; cudaMalloc((void**)&A_d,sizeof(int)*10); cudaMalloc((void**)&index_d,sizeof(int)*8192); cudaMalloc((void**)&maxIndex_d,sizeof(int)*10); cudaMemcpy(A_d,&A,sizeof(int)*10,cudaMemcpyHostToDevice); cudaMemcpy(maxIndex_d,&maxIndex,sizeof(int)*10,cudaMemcpyHostToDevice); cudaMemcpy(index_d,&index,sizeof(int)*8192,cudaMemcpyHostToDevice); fun<<<16,512>>>(A_d,maxIndex_d,index_d); cudaMemcpy(&A,A_d,sizeof(int)*10,cudaMemcpyDeviceToHost); cudaMemcpy(&maxIndex,maxIndex_d,sizeof(int)*10,cudaMemcpyDeviceToHost); printf("\nindex \n"); for(int i=0;i<8192;i++) { printf("%d\n",index[i]); } for(int i=0;i<10;i++) { printf(" %d max is %d\n",A[i],maxIndex[i]); } }不会更新。

{{1}}

仅存储迭代次数最少的循环中的值

1 个答案: