在CUDA 6.5下循环展开时输出错误

时间:2014-09-08 21:25:15

标签: cuda

我有以下代码:

#include<stdio.h>

#define N_ITERATIONS 2048
#define UNROLL 32

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x ;

    if (tid < N) {

        int a = d_a[tid];
        int b = d_b[tid];
        int c = d_c[tid];

        #pragma unroll UNROLL
        for(unsigned int i = 0; i < N_ITERATIONS; i++) {
            a = a * b + c;
        }

        d_a[tid] = a;
    }

}

/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/2) {

        int a1 = d_a[tid];
        int b1 = d_b[tid];
        int c1 = d_c[tid];

        int a2 = d_a[tid+N/2];
        int b2 = d_b[tid+N/2];
        int c2 = d_c[tid+N/2];

        #pragma unroll UNROLL
        for(unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            a2 = a2 * b2 + c2;
        }

        d_a[tid]        = a1;
        d_a[tid+N/2]    = a2;
    }

}

/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(int *d_a, int *d_b, int *d_c, unsigned int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/4) {

        int a1 = d_a[tid];
        int b1 = d_b[tid];
        int c1 = d_c[tid];

        int a2 = d_a[tid+N/4];
        int b2 = d_b[tid+N/4];
        int c2 = d_c[tid+N/4];

        int a3 = d_a[tid+N/2];
        int b3 = d_b[tid+N/2];
        int c3 = d_c[tid+N/2];

        int a4 = d_a[tid+3*N/4];
        int b4 = d_b[tid+3*N/4];
        int c4 = d_c[tid+3*N/4];

        #pragma unroll UNROLL
        for(unsigned int i = 0; i < N_ITERATIONS; i++) {
            a1 = a1 * b1 + c1;
            //if (tid==0) printf("iteration %i %i\n",i,a1);
            a2 = a2 * b2 + c2;
            a3 = a3 * b3 + c3;
            a4 = a4 * b4 + c4;
        }
        //if (tid==0) printf("last iteration %i\n",a1);

        d_a[tid]        = a1;
        d_a[tid+N/4]    = a2;
        d_a[tid+N/2]    = a3;
        d_a[tid+3*N/4]  = a4;
    }

}

/********/
/* MAIN */
/********/
void main() {

    const int N = 1024;

    int blockSize;      // The launch configurator returned block size 
    int minGridSize;    // The minimum grid size needed to achieve the maximum occupancy for a full device launch 

    int *h_a                = (int*)malloc(N*sizeof(int));
    int *h_a_result_host    = (int*)malloc(N*sizeof(int));
    int *h_a_result_device  = (int*)malloc(N*sizeof(int));
    int *h_b                = (int*)malloc(N*sizeof(int));
    int *h_c                = (int*)malloc(N*sizeof(int));

    for (int i=0; i<N; i++) {
        h_a[i] = 2;
        h_b[i] = 1;
        h_c[i] = 2;
        h_a_result_host[i] = h_a[i];
        for(unsigned int k = 0; k < N_ITERATIONS; k++) {
            h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
        }
    }

    int *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(int)));
    int *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(int)));
    int *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(int)));

    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(int), cudaMemcpyHostToDevice));

    // --- Creating events for timing
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /***********/
    /* KERNEL2 */
    /***********/
    cudaEventRecord(start, 0);
    kernel2<<<1, N/4>>>(d_a, d_b, d_c, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }

    cudaDeviceReset();

}

我正在使用CUDA 6.5为compute_20, sm_21(GT540M)编译它,它可以正常用于UNROLL = 2, 4, 8, 16,但它不再适用于UNROLL = 32,因为结果对于输出数组的第一个元素已经不正确了。

错误消息是:

Error at i=0! Host = 4098; Device = 4036

如果我监控tid = 0的处理,即我取消注释printf行,我会看到a1保持正确的值,直到最后for次迭代,而,在退出for循环后,变量立即变得不正确。换句话说,通过取消注释两个printf,输出为

....
iteration 2047 4098
last iteration 4036

表示for循环正确执行,但退出时,a1变量的值会发生变化。

如果我使用CUDA UNROLL = 325.5编译6.0的代码,它就会有效。

看一下反汇编的代码,我看到了

CUDA 5.5

非常类似于CUDA 6.0 - 此处未显示

CUDA 6.0

            Function : _Z7kernel2PiS_S_j
    .headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
    /*0000*/         MOV R1, c[0x1][0x100];
    /*0008*/         NOP;
    /*0010*/         MOV R3, c[0x0][0x38];
    /*0018*/         S2R R0, SR_CTAID.X;
    /*0020*/         SHR.U32 R3, R3, 0x2;
    /*0028*/         S2R R2, SR_TID.X;
    /*0030*/         IMAD R15, R0, c[0x0][0x8], R2;
    /*0038*/         ISETP.GE.U32.AND P0, PT, R15, R3, PT;
    /*0040*/     @P0 EXIT;
    /*0048*/         MOV32I R17, 0x4;
    /*0050*/         IADD R7, R15, R3;
    /*0058*/         IMAD R8.CC, R15, R17, c[0x0][0x20];
    /*0060*/         IMAD.HI.X R9, R15, R17, c[0x0][0x24];
    /*0068*/         IMAD R20.CC, R15, R17, c[0x0][0x28];
    /*0070*/         LD.E R0, [R8];
    /*0078*/         IMAD.HI.X R21, R15, R17, c[0x0][0x2c];
    /*0080*/         IMAD R2.CC, R15, R17, c[0x0][0x30];
    /*0088*/         MOV R12, c[0x0][0x38];
    /*0090*/         IMAD.HI.X R3, R15, R17, c[0x0][0x34];
    /*0098*/         IMUL R13, R12, 0x3;
    /*00a0*/         IMAD.U32.U32 R10.CC, R7, R17, c[0x0][0x30];
    /*00a8*/         LD.E R14, [R2];
    /*00b0*/         IMAD.U32.U32.HI.X R11, R7, R17, c[0x0][0x34];
    /*00b8*/         LD.E R16, [R20];
    /*00c0*/         IMAD.U32.U32 R4.CC, R7, R17, c[0x0][0x20];
    /*00c8*/         SHR.U32 R3, R12, 0x1;
    /*00d0*/         IMAD.U32.U32.HI.X R5, R7, R17, c[0x0][0x24];
    /*00d8*/         IADD R20, R15, R3;
    /*00e0*/         IMAD.U32.U32 R6.CC, R7, R17, c[0x0][0x28];
    /*00e8*/         LD.E R18, [R4];
    /*00f0*/         IMAD.U32.U32.HI R13, R13, c[0x10][0x0], R15;
    /*00f8*/         LD.E R21, [R10];
    /*0100*/         IMAD.U32.U32.HI.X R7, R7, R17, c[0x0][0x2c];
    /*0108*/         IMAD.U32.U32 R8.CC, R13, R17, c[0x0][0x28];
    /*0110*/         LD.E R19, [R6];
    /*0118*/         IMAD.U32.U32.HI.X R9, R13, R17, c[0x0][0x2c];
    /*0120*/         IMAD.U32.U32 R2.CC, R13, R17, c[0x0][0x20];
    /*0128*/         LD.E R9, [R8];
    /*0130*/         IMAD.U32.U32.HI.X R3, R13, R17, c[0x0][0x24];
    /*0138*/         IMAD.U32.U32 R4.CC, R20, R17, c[0x0][0x28];
    /*0140*/         IMAD.U32.U32.HI.X R5, R20, R17, c[0x0][0x2c];
    /*0148*/         LD.E R8, [R2];
    /*0150*/         IMAD.U32.U32 R6.CC, R20, R17, c[0x0][0x30];
    /*0158*/         LD.E R5, [R4];
    /*0160*/         IMAD.U32.U32.HI.X R7, R20, R17, c[0x0][0x34];
    /*0168*/         MOV32I R4, 0xfffff800;
    /*0170*/         IMAD.U32.U32 R12.CC, R13, R17, c[0x0][0x30];
    /*0178*/         LD.E R6, [R6];
    /*0180*/         IMAD.U32.U32.HI.X R13, R13, R17, c[0x0][0x34];
    /*0188*/         IMAD.U32.U32 R10.CC, R20, R17, c[0x0][0x20];
    /*0190*/         LD.E R13, [R12];
    /*0198*/         IMAD.U32.U32.HI.X R11, R20, R17, c[0x0][0x24];
    /*01a0*/         LD.E R17, [R10];
    /*01a8*/         IMAD R0, R0, R16, R14;
    /*01b0*/         IMAD R7, R18, R19, R21;
    /*01b8*/         IMAD R12, R17, R5, R6;
    /*01c0*/         IMAD R8, R8, R9, R13;
    /*01c8*/         IMAD R0, R0, R16, R14;
    /*01d0*/         IMAD R7, R7, R19, R21;
    /*01d8*/         IMAD R12, R12, R5, R6;
    /*01e0*/         IMAD R8, R8, R9, R13;
    /*01e8*/         IMAD R0, R0, R16, R14;
    /*01f0*/         IMAD R7, R7, R19, R21;
    /*01f8*/         IMAD R12, R12, R5, R6;
    /*0200*/         IMAD R8, R8, R9, R13;
    /*0208*/         IMAD R0, R0, R16, R14;
    /*0210*/         IMAD R7, R7, R19, R21;
    /*0218*/         IMAD R12, R12, R5, R6;
    /*0220*/         IMAD R8, R8, R9, R13;
    /*0228*/         IMAD R0, R0, R16, R14;
    /*0230*/         IMAD R7, R7, R19, R21;
    /*0238*/         IMAD R12, R12, R5, R6;
    /*0240*/         IMAD R8, R8, R9, R13;
    /*0248*/         IMAD R0, R0, R16, R14;
    /*0250*/         IMAD R7, R7, R19, R21;
    /*0258*/         IMAD R12, R12, R5, R6;
    /*0260*/         IMAD R8, R8, R9, R13;
    /*0268*/         IMAD R0, R0, R16, R14;
    /*0270*/         IMAD R7, R7, R19, R21;
    /*0278*/         IMAD R12, R12, R5, R6;
    /*0280*/         IMAD R8, R8, R9, R13;
    /*0288*/         IMAD R0, R0, R16, R14;
    /*0290*/         IMAD R7, R7, R19, R21;
    /*0298*/         IMAD R12, R12, R5, R6;
    /*02a0*/         IMAD R8, R8, R9, R13;
    /*02a8*/         IMAD R0, R0, R16, R14;
    /*02b0*/         IMAD R7, R7, R19, R21;
    /*02b8*/         IMAD R12, R12, R5, R6;
    /*02c0*/         IMAD R8, R8, R9, R13;
    /*02c8*/         IMAD R0, R0, R16, R14;
    /*02d0*/         IMAD R7, R7, R19, R21;
    /*02d8*/         IMAD R12, R12, R5, R6;
    /*02e0*/         IMAD R8, R8, R9, R13;
    /*02e8*/         IMAD R0, R0, R16, R14;
    /*02f0*/         IMAD R7, R7, R19, R21;
    /*02f8*/         IMAD R12, R12, R5, R6;
    /*0300*/         IMAD R8, R8, R9, R13;
    /*0308*/         IMAD R0, R0, R16, R14;
    /*0310*/         IMAD R7, R7, R19, R21;
    /*0318*/         IMAD R12, R12, R5, R6;
    /*0320*/         IMAD R8, R8, R9, R13;
    /*0328*/         IMAD R0, R0, R16, R14;
    /*0330*/         IMAD R7, R7, R19, R21;
    /*0338*/         IMAD R12, R12, R5, R6;
    /*0340*/         IMAD R8, R8, R9, R13;
    /*0348*/         IMAD R0, R0, R16, R14;
    /*0350*/         IMAD R7, R7, R19, R21;
    /*0358*/         IMAD R12, R12, R5, R6;
    /*0360*/         IMAD R8, R8, R9, R13;
    /*0368*/         IMAD R0, R0, R16, R14;
    /*0370*/         IMAD R7, R7, R19, R21;
    /*0378*/         IMAD R12, R12, R5, R6;
    /*0380*/         IMAD R8, R8, R9, R13;
    /*0388*/         IMAD R0, R0, R16, R14;
    /*0390*/         IMAD R7, R7, R19, R21;
    /*0398*/         IMAD R12, R12, R5, R6;
    /*03a0*/         IMAD R8, R8, R9, R13;
    /*03a8*/         IMAD R0, R0, R16, R14;
    /*03b0*/         IMAD R7, R7, R19, R21;
    /*03b8*/         IMAD R12, R12, R5, R6;
    /*03c0*/         IMAD R8, R8, R9, R13;
    /*03c8*/         IMAD R0, R0, R16, R14;
    /*03d0*/         IMAD R7, R7, R19, R21;
    /*03d8*/         IMAD R12, R12, R5, R6;
    /*03e0*/         IMAD R8, R8, R9, R13;
    /*03e8*/         IMAD R0, R0, R16, R14;
    /*03f0*/         IMAD R7, R7, R19, R21;
    /*03f8*/         IMAD R12, R12, R5, R6;
    /*0400*/         IMAD R8, R8, R9, R13;
    /*0408*/         IMAD R0, R0, R16, R14;
    /*0410*/         IMAD R7, R7, R19, R21;
    /*0418*/         IMAD R12, R12, R5, R6;
    /*0420*/         IMAD R8, R8, R9, R13;
    /*0428*/         IMAD R0, R0, R16, R14;
    /*0430*/         IMAD R7, R7, R19, R21;
    /*0438*/         IMAD R12, R12, R5, R6;
    /*0440*/         IMAD R8, R8, R9, R13;
    /*0448*/         IMAD R0, R0, R16, R14;
    /*0450*/         IMAD R7, R7, R19, R21;
    /*0458*/         IMAD R12, R12, R5, R6;
    /*0460*/         IMAD R8, R8, R9, R13;
    /*0468*/         IMAD R0, R0, R16, R14;
    /*0470*/         IMAD R7, R7, R19, R21;
    /*0478*/         IMAD R12, R12, R5, R6;
    /*0480*/         IMAD R8, R8, R9, R13;
    /*0488*/         IMAD R0, R0, R16, R14;
    /*0490*/         IMAD R7, R7, R19, R21;
    /*0498*/         IMAD R12, R12, R5, R6;
    /*04a0*/         IMAD R8, R8, R9, R13;
    /*04a8*/         IMAD R0, R0, R16, R14;
    /*04b0*/         IMAD R7, R7, R19, R21;
    /*04b8*/         IMAD R12, R12, R5, R6;
    /*04c0*/         IMAD R8, R8, R9, R13;
    /*04c8*/         IMAD R0, R0, R16, R14;
    /*04d0*/         IMAD R7, R7, R19, R21;
    /*04d8*/         IMAD R12, R12, R5, R6;
    /*04e0*/         IMAD R8, R8, R9, R13;
    /*04e8*/         IMAD R0, R0, R16, R14;
    /*04f0*/         IMAD R7, R7, R19, R21;
    /*04f8*/         IMAD R12, R12, R5, R6;
    /*0500*/         IMAD R8, R8, R9, R13;
    /*0508*/         IMAD R0, R0, R16, R14;
    /*0510*/         IMAD R7, R7, R19, R21;
    /*0518*/         IMAD R12, R12, R5, R6;
    /*0520*/         IMAD R8, R8, R9, R13;
    /*0528*/         IMAD R0, R0, R16, R14;
    /*0530*/         IMAD R7, R7, R19, R21;
    /*0538*/         IMAD R12, R12, R5, R6;
    /*0540*/         IMAD R8, R8, R9, R13;
    /*0548*/         IADD R4, R4, 0x20;
    /*0550*/         IMAD R0, R0, R16, R14;
    /*0558*/         IMAD R7, R7, R19, R21;
    /*0560*/         IMAD R12, R12, R5, R6;
    /*0568*/         IMAD R8, R8, R9, R13;
    /*0570*/         ISETP.NE.AND P0, PT, R4, RZ, PT;
    /*0578*/         IMAD R0, R0, R16, R14;
    /*0580*/         IMAD R18, R7, R19, R21;
    /*0588*/         IMAD R17, R12, R5, R6;
    /*0590*/         IMAD R8, R8, R9, R13;
    /*0598*/         IMAD R0, R0, R16, R14;
    /*05a0*/         IMAD R18, R18, R19, R21;
    /*05a8*/         IMAD R17, R17, R5, R6;
    /*05b0*/         IMAD R8, R8, R9, R13;
    /*05b8*/     @P0 BRA 0x1a8;
    /*05c0*/         MOV32I R4, 0x40000000;
    /*05c8*/         MOV32I R9, 0x4;
    /*05d0*/         IMAD.U32.U32.HI R7, R4, c[0x0][0x38], R15;
    /*05d8*/         IMAD R4.CC, R15, R9, c[0x0][0x20];
    /*05e0*/         IMAD.HI.X R5, R15, R9, c[0x0][0x24];
    /*05e8*/         IMAD.U32.U32 R6.CC, R7, R9, c[0x0][0x20];
    /*05f0*/         IMAD.U32.U32.HI.X R7, R7, R9, c[0x0][0x24];
    /*05f8*/         ST.E [R4], R0;
    /*0600*/         ST.E [R6], R18;
    /*0608*/         ST.E [R10], R17;
    /*0610*/         ST.E [R2], R8;
    /*0618*/         EXIT;

CUDA 6.5

            Function : _Z7kernel2PiS_S_j
    .headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
    /*0000*/         MOV R1, c[0x1][0x100];
    /*0008*/         NOP;
    /*0010*/         MOV R3, c[0x0][0x38];
    /*0018*/         S2R R0, SR_CTAID.X;
    /*0020*/         SHR.U32 R3, R3, 0x2;
    /*0028*/         S2R R2, SR_TID.X;
    /*0030*/         IMAD R0, R0, c[0x0][0x8], R2;
    /*0038*/         ISETP.GE.U32.AND P0, PT, R0, R3, PT;
    /*0040*/     @P0 EXIT;
    /*0048*/         MOV32I R14, 0x4;
    /*0050*/         IADD R21, R0, R3;
    /*0058*/         IMAD R18.CC, R0, R14, c[0x0][0x20];
    /*0060*/         IMAD.HI.X R19, R0, R14, c[0x0][0x24];
    /*0068*/         IMAD R10.CC, R0, R14, c[0x0][0x28];
    /*0070*/         LD.E R15, [R18];
    /*0078*/         IMAD.HI.X R11, R0, R14, c[0x0][0x2c];
    /*0080*/         IMAD R12.CC, R0, R14, c[0x0][0x30];
    /*0088*/         MOV R22, c[0x0][0x38];
    /*0090*/         IMAD.HI.X R13, R0, R14, c[0x0][0x34];
    /*0098*/         IMUL R2, R22, 0x3;
    /*00a0*/         IMAD.U32.U32 R8.CC, R21, R14, c[0x0][0x20];
    /*00a8*/         LD.E R17, [R10];
    /*00b0*/         IMAD.U32.U32.HI.X R9, R21, R14, c[0x0][0x24];
    /*00b8*/         LD.E R20, [R12];
    /*00c0*/         IMAD.U32.U32 R4.CC, R21, R14, c[0x0][0x28];
    /*00c8*/         SHR.U32 R13, R22, 0x1;
    /*00d0*/         IMAD.U32.U32.HI R16, R2, c[0x10][0x0], R0;
    /*00d8*/         LD.E R23, [R8];
    /*00e0*/         IMAD.U32.U32.HI.X R5, R21, R14, c[0x0][0x2c];
    /*00e8*/         IADD R19, R0, R13;
    /*00f0*/         IMAD.U32.U32 R2.CC, R16, R14, c[0x0][0x28];
    /*00f8*/         LD.E R22, [R4];
    /*0100*/         IMAD.U32.U32.HI.X R3, R16, R14, c[0x0][0x2c];
    /*0108*/         IMAD.U32.U32 R6.CC, R16, R14, c[0x0][0x20];
    /*0110*/         LD.E R2, [R2];
    /*0118*/         IMAD.U32.U32.HI.X R7, R16, R14, c[0x0][0x24];
    /*0120*/         MOV32I R3, 0xfffff800;
    /*0128*/         IMAD.U32.U32 R10.CC, R21, R14, c[0x0][0x30];
    /*0130*/         IMAD.U32.U32.HI.X R11, R21, R14, c[0x0][0x34];
    /*0138*/         IMAD.U32.U32 R12.CC, R16, R14, c[0x0][0x30];
    /*0140*/         IMAD.U32.U32.HI.X R13, R16, R14, c[0x0][0x34];
    /*0148*/         LD.E R10, [R10];
    /*0150*/         IMAD.U32.U32 R8.CC, R19, R14, c[0x0][0x28];
    /*0158*/         LD.E R16, [R6];
    /*0160*/         IMAD.U32.U32.HI.X R9, R19, R14, c[0x0][0x2c];
    /*0168*/         LD.E R12, [R12];
    /*0170*/         IMAD.U32.U32 R4.CC, R19, R14, c[0x0][0x30];
    /*0178*/         LD.E R8, [R8];
    /*0180*/         IMAD.U32.U32.HI.X R5, R19, R14, c[0x0][0x34];
    /*0188*/         IMAD.U32.U32 R18.CC, R19, R14, c[0x0][0x20];
    /*0190*/         LD.E R4, [R4];
    /*0198*/         IMAD.U32.U32.HI.X R19, R19, R14, c[0x0][0x24];
    /*01a0*/         LD.E R14, [R18];
    /*01a8*/         IMAD R5, R15, R17, R20;
    /*01b0*/         IMAD R9, R23, R22, R10;
    /*01b8*/         IMAD R11, R14, R8, R4;
    /*01c0*/         IMAD R13, R16, R2, R12;
    /*01c8*/         IMAD R15, R5, R17, R20;
    /*01d0*/         IMAD R21, R9, R22, R10;
    /*01d8*/         IMAD R14, R11, R8, R4;
    /*01e0*/         IMAD R16, R13, R2, R12;
    /*01e8*/         IMAD R15, R15, R17, R20;
    /*01f0*/         IMAD R21, R21, R22, R10;
    /*01f8*/         IMAD R14, R14, R8, R4;
    /*0200*/         IMAD R16, R16, R2, R12;
    /*0208*/         IMAD R15, R15, R17, R20;
    /*0210*/         IMAD R21, R21, R22, R10;
    /*0218*/         IMAD R14, R14, R8, R4;
    /*0220*/         IMAD R16, R16, R2, R12;
    /*0228*/         IMAD R15, R15, R17, R20;
    /*0230*/         IMAD R21, R21, R22, R10;
    /*0238*/         IMAD R14, R14, R8, R4;
    /*0240*/         IMAD R16, R16, R2, R12;
    /*0248*/         IMAD R15, R15, R17, R20;
    /*0250*/         IMAD R21, R21, R22, R10;
    /*0258*/         IMAD R14, R14, R8, R4;
    /*0260*/         IMAD R16, R16, R2, R12;
    /*0268*/         IMAD R15, R15, R17, R20;
    /*0270*/         IMAD R21, R21, R22, R10;
    /*0278*/         IMAD R14, R14, R8, R4;
    /*0280*/         IMAD R16, R16, R2, R12;
    /*0288*/         IMAD R15, R15, R17, R20;
    /*0290*/         IMAD R21, R21, R22, R10;
    /*0298*/         IMAD R14, R14, R8, R4;
    /*02a0*/         IMAD R16, R16, R2, R12;
    /*02a8*/         IMAD R15, R15, R17, R20;
    /*02b0*/         IMAD R21, R21, R22, R10;
    /*02b8*/         IMAD R14, R14, R8, R4;
    /*02c0*/         IMAD R16, R16, R2, R12;
    /*02c8*/         IMAD R15, R15, R17, R20;
    /*02d0*/         IMAD R21, R21, R22, R10;
    /*02d8*/         IMAD R14, R14, R8, R4;
    /*02e0*/         IMAD R16, R16, R2, R12;
    /*02e8*/         IMAD R15, R15, R17, R20;
    /*02f0*/         IMAD R21, R21, R22, R10;
    /*02f8*/         IMAD R14, R14, R8, R4;
    /*0300*/         IMAD R16, R16, R2, R12;
    /*0308*/         IMAD R15, R15, R17, R20;
    /*0310*/         IMAD R21, R21, R22, R10;
    /*0318*/         IMAD R14, R14, R8, R4;
    /*0320*/         IMAD R16, R16, R2, R12;
    /*0328*/         IMAD R15, R15, R17, R20;
    /*0330*/         IMAD R21, R21, R22, R10;
    /*0338*/         IMAD R14, R14, R8, R4;
    /*0340*/         IMAD R16, R16, R2, R12;
    /*0348*/         IMAD R15, R15, R17, R20;
    /*0350*/         IMAD R21, R21, R22, R10;
    /*0358*/         IMAD R14, R14, R8, R4;
    /*0360*/         IMAD R16, R16, R2, R12;
    /*0368*/         IMAD R15, R15, R17, R20;
    /*0370*/         IMAD R21, R21, R22, R10;
    /*0378*/         IMAD R14, R14, R8, R4;
    /*0380*/         IMAD R16, R16, R2, R12;
    /*0388*/         IMAD R15, R15, R17, R20;
    /*0390*/         IMAD R21, R21, R22, R10;
    /*0398*/         IMAD R14, R14, R8, R4;
    /*03a0*/         IMAD R16, R16, R2, R12;
    /*03a8*/         IMAD R15, R15, R17, R20;
    /*03b0*/         IMAD R21, R21, R22, R10;
    /*03b8*/         IMAD R14, R14, R8, R4;
    /*03c0*/         IMAD R16, R16, R2, R12;
    /*03c8*/         IMAD R15, R15, R17, R20;
    /*03d0*/         IMAD R21, R21, R22, R10;
    /*03d8*/         IMAD R14, R14, R8, R4;
    /*03e0*/         IMAD R16, R16, R2, R12;
    /*03e8*/         IMAD R15, R15, R17, R20;
    /*03f0*/         IMAD R21, R21, R22, R10;
    /*03f8*/         IMAD R14, R14, R8, R4;
    /*0400*/         IMAD R16, R16, R2, R12;
    /*0408*/         IMAD R15, R15, R17, R20;
    /*0410*/         IMAD R21, R21, R22, R10;
    /*0418*/         IMAD R14, R14, R8, R4;
    /*0420*/         IMAD R16, R16, R2, R12;
    /*0428*/         IMAD R15, R15, R17, R20;
    /*0430*/         IMAD R21, R21, R22, R10;
    /*0438*/         IMAD R14, R14, R8, R4;
    /*0440*/         IMAD R16, R16, R2, R12;
    /*0448*/         IMAD R15, R15, R17, R20;
    /*0450*/         IMAD R21, R21, R22, R10;
    /*0458*/         IMAD R14, R14, R8, R4;
    /*0460*/         IMAD R16, R16, R2, R12;
    /*0468*/         IMAD R15, R15, R17, R20;
    /*0470*/         IMAD R21, R21, R22, R10;
    /*0478*/         IMAD R14, R14, R8, R4;
    /*0480*/         IMAD R16, R16, R2, R12;
    /*0488*/         IMAD R15, R15, R17, R20;
    /*0490*/         IMAD R21, R21, R22, R10;
    /*0498*/         IMAD R14, R14, R8, R4;
    /*04a0*/         IMAD R16, R16, R2, R12;
    /*04a8*/         IMAD R15, R15, R17, R20;
    /*04b0*/         IMAD R21, R21, R22, R10;
    /*04b8*/         IMAD R14, R14, R8, R4;
    /*04c0*/         IMAD R16, R16, R2, R12;
    /*04c8*/         IMAD R15, R15, R17, R20;
    /*04d0*/         IMAD R21, R21, R22, R10;
    /*04d8*/         IMAD R14, R14, R8, R4;
    /*04e0*/         IMAD R16, R16, R2, R12;
    /*04e8*/         IMAD R15, R15, R17, R20;
    /*04f0*/         IMAD R21, R21, R22, R10;
    /*04f8*/         IMAD R14, R14, R8, R4;
    /*0500*/         IMAD R16, R16, R2, R12;
    /*0508*/         IMAD R15, R15, R17, R20;
    /*0510*/         IMAD R21, R21, R22, R10;
    /*0518*/         IMAD R14, R14, R8, R4;
    /*0520*/         IMAD R16, R16, R2, R12;
    /*0528*/         IMAD R15, R15, R17, R20;
    /*0530*/         IMAD R21, R21, R22, R10;
    /*0538*/         IMAD R14, R14, R8, R4;
    /*0540*/         IMAD R16, R16, R2, R12;
    /*0548*/         IADD R3, R3, 0x20;
    /*0550*/         IMAD R15, R15, R17, R20;
    /*0558*/         IMAD R21, R21, R22, R10;
    /*0560*/         IMAD R14, R14, R8, R4;
    /*0568*/         IMAD R16, R16, R2, R12;
    /*0570*/         ISETP.NE.AND P0, PT, R3, RZ, PT;
    /*0578*/         IMAD R15, R15, R17, R20;
    /*0580*/         IMAD R23, R21, R22, R10;
    /*0588*/         IMAD R14, R14, R8, R4;
    /*0590*/         IMAD R16, R16, R2, R12;
    /*0598*/         IMAD R15, R15, R17, R20;
    /*05a0*/         IMAD R23, R23, R22, R10;
    /*05a8*/         IMAD R14, R14, R8, R4;
    /*05b0*/         IMAD R16, R16, R2, R12;
    /*05b8*/     @P0 BRA 0x1a8;
    /*05c0*/         MOV32I R2, 0x40000000;
    /*05c8*/         MOV32I R8, 0x4;
    /*05d0*/         IMAD.U32.U32.HI R4, R2, c[0x0][0x38], R0;
    /*05d8*/         IMAD R2.CC, R0, R8, c[0x0][0x20];
    /*05e0*/         IMAD.HI.X R3, R0, R8, c[0x0][0x24];
    /*05e8*/         IMAD.U32.U32 R14.CC, R4, R8, c[0x0][0x20];
    /*05f0*/         IMAD.U32.U32.HI.X R15, R4, R8, c[0x0][0x24];
    /*05f8*/         ST.E [R2], R5;
    /*0600*/         ST.E [R14], R9;
    /*0608*/         ST.E [R18], R11;
    /*0610*/         ST.E [R6], R13;
    /*0618*/         EXIT;

CUDA ST5.5的{​​{1}}操作适用于6.0R10R18R17由最后R8个更新,而对于CUDA 6.5,它们在IMADR5R9R11上运行R13

我做错了什么?

我的设置:使用Windows 7的核心i7笔记本电脑,编译一个发布项目,IMAD - 位或32 - 位(同样的问题)。以上是指64 - 位。

命令行:

64

修改

仅在存在两个额外内核时才会出现此问题。如果我发表评论,一切正常。工作和非工作案例的反汇编代码分别报告herehere

1 个答案:

答案 0 :(得分:1)

我已确认问题已在CUDA 7 EA中修复。当CUDA 7 RC或CUDA 7生产版本可用时,问题也应该在那里修复。