Question

在普通的 C++ 中，下面的写法是安全的：只要前两个子句中有一个为真，第三个子句就会被短路跳过，不会执行。我想知道 CUDA 内核代码是否也具有这一性质，还是说编译器会为了最大化并行性而对所有子句都求值？

// Illustrative snippet: i = -1 lies outside the valid index range [0, 100) of x.
int x[100] = {...}, i = -1;
// `||` evaluates its operands left to right and stops at the first true one,
// so with i = -1 the first clause is true and x[i] is never read.
if (i < 0 || i >= 100 || x[i] == 0) {
  // do something.
}

编辑：

下面的程序是在 Jack 的程序基础上修改的，它运行正常并输出“10”；用 cuda-memcheck 检查也没有报错。

#include <stdio.h>

// Probe kernel for short-circuit evaluation of `||` in device code.
//
// input  : buffer that is only ever indexed at -1 (one element before its start)
// output : buffer whose element 0 receives c when the condition holds
// i, N   : index and bound tested by the first two clauses of the condition
__global__ void test(float *input, float *output, int i, int N) {
    float c = 10;

    // NOTE: uncommenting the next line makes the out-of-bounds read unconditional;
    // the author reports that cuda-memcheck then gives an error.
    // c = input[-1];

    // The third clause is the only place input[-1] is read. If it is evaluated,
    // c is overwritten with that value; if it is skipped, c keeps the value 10.
    if (i < 0 || i >= N || (c = input[-1])) {
        output[0] = c;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `status` is an error, 0 when it is cudaSuccess.
static int cudaFailed(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return 0;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

// Host driver for the short-circuit probe kernel `test`.
//
// Fills an N-element input buffer with 2.0f and a one-element output buffer
// with 3.0f, copies both to the device, launches `test` with a single thread
// and i = -1, then copies the output element back and prints it.
//
// Returns 0 on success and 1 if any allocation, copy or the launch failed.
int main(void) {
    const int i = -1;
    const int N = 10;
    const size_t inputBytes = sizeof(float) * N;
    const size_t outputBytes = sizeof(float);

    int rc = 1;
    float *input = (float*)malloc(inputBytes);
    float *output = (float*)malloc(outputBytes);
    float *dev_input = NULL;
    float *dev_output = NULL;

    // Single-pass block: `break` jumps to the shared cleanup code below.
    do {
        if (input == NULL || output == NULL) {
            fprintf(stderr, "host allocation failed\n");
            break;
        }

        for (int j = 0; j < N; j++) {
            input[j] = 2.0f;
        }
        output[0] = 3.0f;

        if (cudaFailed(cudaMalloc((void**)&dev_input, inputBytes), "cudaMalloc(dev_input)")) break;
        if (cudaFailed(cudaMalloc((void**)&dev_output, outputBytes), "cudaMalloc(dev_output)")) break;

        if (cudaFailed(cudaMemcpy(dev_input, input, inputBytes, cudaMemcpyHostToDevice),
                       "cudaMemcpy(dev_input)")) break;
        if (cudaFailed(cudaMemcpy(dev_output, output, outputBytes, cudaMemcpyHostToDevice),
                       "cudaMemcpy(dev_output)")) break;

        test<<<1,1>>>(dev_input, dev_output, i, N);
        // A kernel launch returns nothing; launch errors are fetched explicitly.
        if (cudaFailed(cudaGetLastError(), "kernel launch")) break;

        // The blocking copy waits for the kernel, so a fault raised while the
        // kernel ran is reported by this call.
        if (cudaFailed(cudaMemcpy(output, dev_output, outputBytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(output)")) break;

        printf("%f\n", output[0]);
        rc = 0;
    } while (0);

    // cudaFree and free both accept NULL, so partial setups are released safely.
    cudaFree(dev_input);
    cudaFree(dev_output);
    free(input);
    free(output);
    return rc;
}

Answer 1

尝试下面的简单代码，其中内核函数尝试访问input[-1]。你会发现它会卡住。

#include <stdio.h>

// Kernel from the answer: copies input[i] to output[i] when
// `i < N || input[i] == 0` holds.
//
// Both the second clause and the body read input[i], and nothing here checks
// i >= 0. A negative i with i < N therefore reaches the body and accesses
// input[i] / output[i] out of range even when the second clause is skipped.
__global__ void test(float *input, float *output, int i, int N) {

    if (i < N || input[i] == 0) {
        output[i] = input[i];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `status` is an error, 0 when it is cudaSuccess.
static int cudaCallFailed(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return 0;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

// Host driver for the answer's `test` kernel.
//
// Allocates one float on the host and one float each for the device input and
// output, copies the host value to the device, and launches `test` with a
// single thread, i = -1 and N = 10. With these arguments the kernel indexes
// element -1 of both device buffers.
//
// Returns 0 if every call succeeded and 1 otherwise. (`main` must return int
// in standard C++; the previous `void main` is rejected by conforming compilers.)
int main(void) {
    const int i = -1;
    const int N = 10;

    int rc = 1;
    float *input = (float*)malloc(sizeof(float));
    float *dev_input = NULL;
    float *dev_output = NULL;

    // Single-pass block: `break` jumps to the shared cleanup code below.
    do {
        if (input == NULL) {
            fprintf(stderr, "host allocation failed\n");
            break;
        }
        input[0] = 2.f;

        if (cudaCallFailed(cudaMalloc((void**)&dev_input, sizeof(float)), "cudaMalloc(dev_input)")) break;
        if (cudaCallFailed(cudaMalloc((void**)&dev_output, sizeof(float)), "cudaMalloc(dev_output)")) break;

        if (cudaCallFailed(cudaMemcpy(dev_input, input, sizeof(float), cudaMemcpyHostToDevice),
                           "cudaMemcpy(dev_input)")) break;

        test<<<1,1>>>(dev_input, dev_output, i, N);
        // A kernel launch returns nothing; launch errors are fetched explicitly.
        if (cudaCallFailed(cudaGetLastError(), "kernel launch")) break;

        // Launches are asynchronous: without this wait the process could exit
        // before the kernel runs, and an in-kernel fault would never be reported.
        if (cudaCallFailed(cudaDeviceSynchronize(), "kernel execution")) break;

        rc = 0;
    } while (0);

    // cudaFree and free both accept NULL, so partial setups are released safely.
    cudaFree(dev_input);
    cudaFree(dev_output);
    free(input);
    return rc;
}

可以通过查看反汇编代码来解释原因。

   MOV R1, c[0x1][0x100];                              R1 = c[0x1][0x100]
   NOP;
   MOV R3, c[0x0][0x28];                               R3 = c[0x0][0x28]
   SHL R2, R3, 0x2;                                    R2 = R3 << 2 (i.e. i * sizeof(float))
   IADD R0, R2, c[0x0][0x20];                          R0 = R2 + c[0x0][0x20] (address of input[i])
   LDU R0, [R0];                                       Load the memory addressed by R0 to R0
   FSETP.EQ.AND P0, PT, R0, RZ, PT;                    Predicate register P0 will contain result of test R0 == 0
   ISETP.LT.OR P0, PT, R3, c[0x0][0x2c], P0;           Predicate register P0 will contain result of test P0 || (R3 < c[0x0][0x2c])
@P0 IADD R2, R2, c[0x0][0x24];                         ...
@P0 ST [R2], R0;
   EXIT ;

如您所见，无论第一个子句的结果如何，设备都会尝试从全局内存加载数据。

Answer 2

CUDA C/C++ 编译器在这方面应当遵守 C/C++ 语言标准的要求。

具体而言，对于非重载的 || 和 && 运算符，编译器应当遵守语言标准对求值顺序和短路求值的要求。

cuda代码是否跳过了逻辑表达式中不必要的子句的执行？

2 个答案: