我有以下代码:
#include <stdio.h>
#include <stdlib.h>
#define N_ITERATIONS 2048
#define UNROLL 32
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports failures together with the call site.
// The do/while(0) form makes the macro behave as a single statement, so it is
// safe inside un-braced if/else bodies.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (normally __FILE__)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (the default), terminate the process using `code` as
//           the exit status
// `file` is a pointer to const because __FILE__ expands to a string literal,
// which must not be bound to a non-const char pointer.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
// Baseline kernel: one dependent multiply-add chain per thread.
// Every thread whose global index is below N reads one element from each of
// d_a, d_b and d_c, applies a = a * b + c N_ITERATIONS times, and writes the
// result back to d_a at the same index. Threads at or beyond N do nothing.
__global__ void kernel0(int *d_a, int *d_b, int *d_c, unsigned int N) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= N) return;

    const int mul = d_b[tid];
    const int add = d_c[tid];
    int acc = d_a[tid];

    // Each step consumes the previous step's result, so the chain is serial.
    #pragma unroll UNROLL
    for (unsigned int step = 0; step < N_ITERATIONS; ++step) {
        acc = acc * mul + add;
    }

    d_a[tid] = acc;
}
/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
// Two independent multiply-add chains per thread.
// A thread with global index tid < N/2 handles element tid and element
// tid + N/2: it loads both (a, b, c) triples, advances both chains
// N_ITERATIONS times inside the same loop, and stores both results to d_a.
// NOTE(review): with integer division an odd N leaves the last element
// unprocessed; callers presumably pass an even N -- confirm.
__global__ void kernel1(int *d_a, int *d_b, int *d_c, unsigned int N) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int half = N / 2;
    if (tid >= half) return;

    const unsigned int lo = tid;
    const unsigned int hi = tid + half;

    int accLo = d_a[lo];
    int accHi = d_a[hi];
    const int mulLo = d_b[lo];
    const int mulHi = d_b[hi];
    const int addLo = d_c[lo];
    const int addHi = d_c[hi];

    // The two updates do not depend on each other, only on their own
    // previous value.
    #pragma unroll UNROLL
    for (unsigned int step = 0; step < N_ITERATIONS; ++step) {
        accLo = accLo * mulLo + addLo;
        accHi = accHi * mulHi + addHi;
    }

    d_a[lo] = accLo;
    d_a[hi] = accHi;
}
/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
// Four independent multiply-add chains per thread.
// A thread with global index tid < N/4 handles the elements at tid,
// tid + N/4, tid + N/2 and tid + 3*N/4. It loads the four (a, b, c) triples,
// advances all four chains N_ITERATIONS times inside one loop, and stores the
// four results back to d_a at the same indices.
// NOTE(review): the offsets use integer division, so N is presumably expected
// to be a multiple of 4; otherwise some elements are never touched -- confirm.
__global__ void kernel2(int *d_a, int *d_b, int *d_c, unsigned int N) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= N/4) return;

    // Indices of the second, third and fourth element owned by this thread.
    const unsigned int idx2 = tid + N/4;
    const unsigned int idx3 = tid + N/2;
    const unsigned int idx4 = tid + 3*N/4;

    int a1 = d_a[tid];
    int a2 = d_a[idx2];
    int a3 = d_a[idx3];
    int a4 = d_a[idx4];

    const int b1 = d_b[tid];
    const int b2 = d_b[idx2];
    const int b3 = d_b[idx3];
    const int b4 = d_b[idx4];

    const int c1 = d_c[tid];
    const int c2 = d_c[idx2];
    const int c3 = d_c[idx3];
    const int c4 = d_c[idx4];

    #pragma unroll UNROLL
    for (unsigned int i = 0; i < N_ITERATIONS; i++) {
        a1 = a1 * b1 + c1;
        // Debug aid: uncomment to trace the first chain of thread 0.
        //if (tid==0) printf("iteration %i %i\n",i,a1);
        a2 = a2 * b2 + c2;
        a3 = a3 * b3 + c3;
        a4 = a4 * b4 + c4;
    }
    // Debug aid: uncomment to print thread 0's first chain after the loop.
    //if (tid==0) printf("last iteration %i\n",a1);

    d_a[tid]  = a1;
    d_a[idx2] = a2;
    d_a[idx3] = a3;
    d_a[idx4] = a4;
}
/********/
/* MAIN */
/********/
// Host driver.
//   1. Builds three N-element input arrays and computes the expected result
//      on the host by applying a = a * b + c N_ITERATIONS times per element.
//   2. Copies the inputs to the device, times a single-block launch of
//      kernel2 (N/4 threads) with CUDA events and prints the throughput.
//   3. Copies d_a back and reports the first element that differs from the
//      host reference.
// Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE otherwise.
// CUDA API failures terminate the process through gpuErrchk.
int main() {
    const int N = 1024;
    const size_t bytes = N * sizeof(int);
    int status = EXIT_SUCCESS;

    int *h_a = (int*)malloc(bytes);
    int *h_a_result_host = (int*)malloc(bytes);
    int *h_a_result_device = (int*)malloc(bytes);
    int *h_b = (int*)malloc(bytes);
    int *h_c = (int*)malloc(bytes);
    if (!h_a || !h_a_result_host || !h_a_result_device || !h_b || !h_c) {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all five pointers is safe.
        free(h_a); free(h_a_result_host); free(h_a_result_device); free(h_b); free(h_c);
        return EXIT_FAILURE;
    }

    // Inputs plus host-side reference result.
    for (int i = 0; i < N; i++) {
        h_a[i] = 2;
        h_b[i] = 1;
        h_c[i] = 2;
        h_a_result_host[i] = h_a[i];
        for (unsigned int k = 0; k < N_ITERATIONS; k++) {
            h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
        }
    }

    int *d_a; gpuErrchk(cudaMalloc((void**)&d_a, bytes));
    int *d_b; gpuErrchk(cudaMalloc((void**)&d_b, bytes));
    int *d_c; gpuErrchk(cudaMalloc((void**)&d_c, bytes));
    gpuErrchk(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_c, h_c, bytes, cudaMemcpyHostToDevice));

    // --- Creating events for timing
    float elapsedMs = 0.0f;
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    /***********/
    /* KERNEL2 */
    /***********/
    gpuErrchk(cudaEventRecord(start, 0));
    kernel2<<<1, N/4>>>(d_a, d_b, d_c, N);
    // A kernel launch returns no status; launch errors must be polled.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedMs, start, stop));
    // elapsedMs is in milliseconds, hence the 1e-6 factor to reach 1e9 ops/s.
    printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/elapsedMs);

    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, bytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; i++) {
        if (h_a_result_device[i] != h_a_result_host[i]) {
            printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]);
            status = EXIT_FAILURE;
            break;  // report only the first mismatch, then fall through to cleanup
        }
    }

    // Release everything on both the success and the mismatch path.
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(d_a));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_c));
    free(h_a);
    free(h_a_result_host);
    free(h_a_result_device);
    free(h_b);
    free(h_c);
    gpuErrchk(cudaDeviceReset());
    return status;
}
我使用 CUDA 6.5 针对 compute_20, sm_21(GT540M)编译这段代码。当 UNROLL = 2, 4, 8, 16 时它工作正常,但当 UNROLL = 32 时结果不再正确:输出数组的第一个元素就已经是错误的。
错误消息是:
Error at i=0! Host = 4098; Device = 4036
如果我监控 tid = 0 线程的处理过程,即取消注释 printf 行,可以看到 a1 直到 for 循环的最后一次迭代都保持正确的值,但在退出 for 循环之后,该变量立即变得不正确。换句话说,取消注释两个 printf 之后,输出为:
....
iteration 2047 4098
last iteration 4036
这表明 for 循环本身执行正确,但在退出循环时,变量 a1 的值发生了变化。
如果我使用 CUDA 5.5 或 6.0 编译 UNROLL = 32 的代码,它可以正常工作。
查看反汇编代码,我看到以下内容:
CUDA 5.5
与 CUDA 6.0 非常相似,此处未显示。
CUDA 6.0
Function : _Z7kernel2PiS_S_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ NOP;
/*0010*/ MOV R3, c[0x0][0x38];
/*0018*/ S2R R0, SR_CTAID.X;
/*0020*/ SHR.U32 R3, R3, 0x2;
/*0028*/ S2R R2, SR_TID.X;
/*0030*/ IMAD R15, R0, c[0x0][0x8], R2;
/*0038*/ ISETP.GE.U32.AND P0, PT, R15, R3, PT;
/*0040*/ @P0 EXIT;
/*0048*/ MOV32I R17, 0x4;
/*0050*/ IADD R7, R15, R3;
/*0058*/ IMAD R8.CC, R15, R17, c[0x0][0x20];
/*0060*/ IMAD.HI.X R9, R15, R17, c[0x0][0x24];
/*0068*/ IMAD R20.CC, R15, R17, c[0x0][0x28];
/*0070*/ LD.E R0, [R8];
/*0078*/ IMAD.HI.X R21, R15, R17, c[0x0][0x2c];
/*0080*/ IMAD R2.CC, R15, R17, c[0x0][0x30];
/*0088*/ MOV R12, c[0x0][0x38];
/*0090*/ IMAD.HI.X R3, R15, R17, c[0x0][0x34];
/*0098*/ IMUL R13, R12, 0x3;
/*00a0*/ IMAD.U32.U32 R10.CC, R7, R17, c[0x0][0x30];
/*00a8*/ LD.E R14, [R2];
/*00b0*/ IMAD.U32.U32.HI.X R11, R7, R17, c[0x0][0x34];
/*00b8*/ LD.E R16, [R20];
/*00c0*/ IMAD.U32.U32 R4.CC, R7, R17, c[0x0][0x20];
/*00c8*/ SHR.U32 R3, R12, 0x1;
/*00d0*/ IMAD.U32.U32.HI.X R5, R7, R17, c[0x0][0x24];
/*00d8*/ IADD R20, R15, R3;
/*00e0*/ IMAD.U32.U32 R6.CC, R7, R17, c[0x0][0x28];
/*00e8*/ LD.E R18, [R4];
/*00f0*/ IMAD.U32.U32.HI R13, R13, c[0x10][0x0], R15;
/*00f8*/ LD.E R21, [R10];
/*0100*/ IMAD.U32.U32.HI.X R7, R7, R17, c[0x0][0x2c];
/*0108*/ IMAD.U32.U32 R8.CC, R13, R17, c[0x0][0x28];
/*0110*/ LD.E R19, [R6];
/*0118*/ IMAD.U32.U32.HI.X R9, R13, R17, c[0x0][0x2c];
/*0120*/ IMAD.U32.U32 R2.CC, R13, R17, c[0x0][0x20];
/*0128*/ LD.E R9, [R8];
/*0130*/ IMAD.U32.U32.HI.X R3, R13, R17, c[0x0][0x24];
/*0138*/ IMAD.U32.U32 R4.CC, R20, R17, c[0x0][0x28];
/*0140*/ IMAD.U32.U32.HI.X R5, R20, R17, c[0x0][0x2c];
/*0148*/ LD.E R8, [R2];
/*0150*/ IMAD.U32.U32 R6.CC, R20, R17, c[0x0][0x30];
/*0158*/ LD.E R5, [R4];
/*0160*/ IMAD.U32.U32.HI.X R7, R20, R17, c[0x0][0x34];
/*0168*/ MOV32I R4, 0xfffff800;
/*0170*/ IMAD.U32.U32 R12.CC, R13, R17, c[0x0][0x30];
/*0178*/ LD.E R6, [R6];
/*0180*/ IMAD.U32.U32.HI.X R13, R13, R17, c[0x0][0x34];
/*0188*/ IMAD.U32.U32 R10.CC, R20, R17, c[0x0][0x20];
/*0190*/ LD.E R13, [R12];
/*0198*/ IMAD.U32.U32.HI.X R11, R20, R17, c[0x0][0x24];
/*01a0*/ LD.E R17, [R10];
/*01a8*/ IMAD R0, R0, R16, R14;
/*01b0*/ IMAD R7, R18, R19, R21;
/*01b8*/ IMAD R12, R17, R5, R6;
/*01c0*/ IMAD R8, R8, R9, R13;
/*01c8*/ IMAD R0, R0, R16, R14;
/*01d0*/ IMAD R7, R7, R19, R21;
/*01d8*/ IMAD R12, R12, R5, R6;
/*01e0*/ IMAD R8, R8, R9, R13;
/*01e8*/ IMAD R0, R0, R16, R14;
/*01f0*/ IMAD R7, R7, R19, R21;
/*01f8*/ IMAD R12, R12, R5, R6;
/*0200*/ IMAD R8, R8, R9, R13;
/*0208*/ IMAD R0, R0, R16, R14;
/*0210*/ IMAD R7, R7, R19, R21;
/*0218*/ IMAD R12, R12, R5, R6;
/*0220*/ IMAD R8, R8, R9, R13;
/*0228*/ IMAD R0, R0, R16, R14;
/*0230*/ IMAD R7, R7, R19, R21;
/*0238*/ IMAD R12, R12, R5, R6;
/*0240*/ IMAD R8, R8, R9, R13;
/*0248*/ IMAD R0, R0, R16, R14;
/*0250*/ IMAD R7, R7, R19, R21;
/*0258*/ IMAD R12, R12, R5, R6;
/*0260*/ IMAD R8, R8, R9, R13;
/*0268*/ IMAD R0, R0, R16, R14;
/*0270*/ IMAD R7, R7, R19, R21;
/*0278*/ IMAD R12, R12, R5, R6;
/*0280*/ IMAD R8, R8, R9, R13;
/*0288*/ IMAD R0, R0, R16, R14;
/*0290*/ IMAD R7, R7, R19, R21;
/*0298*/ IMAD R12, R12, R5, R6;
/*02a0*/ IMAD R8, R8, R9, R13;
/*02a8*/ IMAD R0, R0, R16, R14;
/*02b0*/ IMAD R7, R7, R19, R21;
/*02b8*/ IMAD R12, R12, R5, R6;
/*02c0*/ IMAD R8, R8, R9, R13;
/*02c8*/ IMAD R0, R0, R16, R14;
/*02d0*/ IMAD R7, R7, R19, R21;
/*02d8*/ IMAD R12, R12, R5, R6;
/*02e0*/ IMAD R8, R8, R9, R13;
/*02e8*/ IMAD R0, R0, R16, R14;
/*02f0*/ IMAD R7, R7, R19, R21;
/*02f8*/ IMAD R12, R12, R5, R6;
/*0300*/ IMAD R8, R8, R9, R13;
/*0308*/ IMAD R0, R0, R16, R14;
/*0310*/ IMAD R7, R7, R19, R21;
/*0318*/ IMAD R12, R12, R5, R6;
/*0320*/ IMAD R8, R8, R9, R13;
/*0328*/ IMAD R0, R0, R16, R14;
/*0330*/ IMAD R7, R7, R19, R21;
/*0338*/ IMAD R12, R12, R5, R6;
/*0340*/ IMAD R8, R8, R9, R13;
/*0348*/ IMAD R0, R0, R16, R14;
/*0350*/ IMAD R7, R7, R19, R21;
/*0358*/ IMAD R12, R12, R5, R6;
/*0360*/ IMAD R8, R8, R9, R13;
/*0368*/ IMAD R0, R0, R16, R14;
/*0370*/ IMAD R7, R7, R19, R21;
/*0378*/ IMAD R12, R12, R5, R6;
/*0380*/ IMAD R8, R8, R9, R13;
/*0388*/ IMAD R0, R0, R16, R14;
/*0390*/ IMAD R7, R7, R19, R21;
/*0398*/ IMAD R12, R12, R5, R6;
/*03a0*/ IMAD R8, R8, R9, R13;
/*03a8*/ IMAD R0, R0, R16, R14;
/*03b0*/ IMAD R7, R7, R19, R21;
/*03b8*/ IMAD R12, R12, R5, R6;
/*03c0*/ IMAD R8, R8, R9, R13;
/*03c8*/ IMAD R0, R0, R16, R14;
/*03d0*/ IMAD R7, R7, R19, R21;
/*03d8*/ IMAD R12, R12, R5, R6;
/*03e0*/ IMAD R8, R8, R9, R13;
/*03e8*/ IMAD R0, R0, R16, R14;
/*03f0*/ IMAD R7, R7, R19, R21;
/*03f8*/ IMAD R12, R12, R5, R6;
/*0400*/ IMAD R8, R8, R9, R13;
/*0408*/ IMAD R0, R0, R16, R14;
/*0410*/ IMAD R7, R7, R19, R21;
/*0418*/ IMAD R12, R12, R5, R6;
/*0420*/ IMAD R8, R8, R9, R13;
/*0428*/ IMAD R0, R0, R16, R14;
/*0430*/ IMAD R7, R7, R19, R21;
/*0438*/ IMAD R12, R12, R5, R6;
/*0440*/ IMAD R8, R8, R9, R13;
/*0448*/ IMAD R0, R0, R16, R14;
/*0450*/ IMAD R7, R7, R19, R21;
/*0458*/ IMAD R12, R12, R5, R6;
/*0460*/ IMAD R8, R8, R9, R13;
/*0468*/ IMAD R0, R0, R16, R14;
/*0470*/ IMAD R7, R7, R19, R21;
/*0478*/ IMAD R12, R12, R5, R6;
/*0480*/ IMAD R8, R8, R9, R13;
/*0488*/ IMAD R0, R0, R16, R14;
/*0490*/ IMAD R7, R7, R19, R21;
/*0498*/ IMAD R12, R12, R5, R6;
/*04a0*/ IMAD R8, R8, R9, R13;
/*04a8*/ IMAD R0, R0, R16, R14;
/*04b0*/ IMAD R7, R7, R19, R21;
/*04b8*/ IMAD R12, R12, R5, R6;
/*04c0*/ IMAD R8, R8, R9, R13;
/*04c8*/ IMAD R0, R0, R16, R14;
/*04d0*/ IMAD R7, R7, R19, R21;
/*04d8*/ IMAD R12, R12, R5, R6;
/*04e0*/ IMAD R8, R8, R9, R13;
/*04e8*/ IMAD R0, R0, R16, R14;
/*04f0*/ IMAD R7, R7, R19, R21;
/*04f8*/ IMAD R12, R12, R5, R6;
/*0500*/ IMAD R8, R8, R9, R13;
/*0508*/ IMAD R0, R0, R16, R14;
/*0510*/ IMAD R7, R7, R19, R21;
/*0518*/ IMAD R12, R12, R5, R6;
/*0520*/ IMAD R8, R8, R9, R13;
/*0528*/ IMAD R0, R0, R16, R14;
/*0530*/ IMAD R7, R7, R19, R21;
/*0538*/ IMAD R12, R12, R5, R6;
/*0540*/ IMAD R8, R8, R9, R13;
/*0548*/ IADD R4, R4, 0x20;
/*0550*/ IMAD R0, R0, R16, R14;
/*0558*/ IMAD R7, R7, R19, R21;
/*0560*/ IMAD R12, R12, R5, R6;
/*0568*/ IMAD R8, R8, R9, R13;
/*0570*/ ISETP.NE.AND P0, PT, R4, RZ, PT;
/*0578*/ IMAD R0, R0, R16, R14;
/*0580*/ IMAD R18, R7, R19, R21;
/*0588*/ IMAD R17, R12, R5, R6;
/*0590*/ IMAD R8, R8, R9, R13;
/*0598*/ IMAD R0, R0, R16, R14;
/*05a0*/ IMAD R18, R18, R19, R21;
/*05a8*/ IMAD R17, R17, R5, R6;
/*05b0*/ IMAD R8, R8, R9, R13;
/*05b8*/ @P0 BRA 0x1a8;
/*05c0*/ MOV32I R4, 0x40000000;
/*05c8*/ MOV32I R9, 0x4;
/*05d0*/ IMAD.U32.U32.HI R7, R4, c[0x0][0x38], R15;
/*05d8*/ IMAD R4.CC, R15, R9, c[0x0][0x20];
/*05e0*/ IMAD.HI.X R5, R15, R9, c[0x0][0x24];
/*05e8*/ IMAD.U32.U32 R6.CC, R7, R9, c[0x0][0x20];
/*05f0*/ IMAD.U32.U32.HI.X R7, R7, R9, c[0x0][0x24];
/*05f8*/ ST.E [R4], R0;
/*0600*/ ST.E [R6], R18;
/*0608*/ ST.E [R10], R17;
/*0610*/ ST.E [R2], R8;
/*0618*/ EXIT;
CUDA 6.5
Function : _Z7kernel2PiS_S_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ NOP;
/*0010*/ MOV R3, c[0x0][0x38];
/*0018*/ S2R R0, SR_CTAID.X;
/*0020*/ SHR.U32 R3, R3, 0x2;
/*0028*/ S2R R2, SR_TID.X;
/*0030*/ IMAD R0, R0, c[0x0][0x8], R2;
/*0038*/ ISETP.GE.U32.AND P0, PT, R0, R3, PT;
/*0040*/ @P0 EXIT;
/*0048*/ MOV32I R14, 0x4;
/*0050*/ IADD R21, R0, R3;
/*0058*/ IMAD R18.CC, R0, R14, c[0x0][0x20];
/*0060*/ IMAD.HI.X R19, R0, R14, c[0x0][0x24];
/*0068*/ IMAD R10.CC, R0, R14, c[0x0][0x28];
/*0070*/ LD.E R15, [R18];
/*0078*/ IMAD.HI.X R11, R0, R14, c[0x0][0x2c];
/*0080*/ IMAD R12.CC, R0, R14, c[0x0][0x30];
/*0088*/ MOV R22, c[0x0][0x38];
/*0090*/ IMAD.HI.X R13, R0, R14, c[0x0][0x34];
/*0098*/ IMUL R2, R22, 0x3;
/*00a0*/ IMAD.U32.U32 R8.CC, R21, R14, c[0x0][0x20];
/*00a8*/ LD.E R17, [R10];
/*00b0*/ IMAD.U32.U32.HI.X R9, R21, R14, c[0x0][0x24];
/*00b8*/ LD.E R20, [R12];
/*00c0*/ IMAD.U32.U32 R4.CC, R21, R14, c[0x0][0x28];
/*00c8*/ SHR.U32 R13, R22, 0x1;
/*00d0*/ IMAD.U32.U32.HI R16, R2, c[0x10][0x0], R0;
/*00d8*/ LD.E R23, [R8];
/*00e0*/ IMAD.U32.U32.HI.X R5, R21, R14, c[0x0][0x2c];
/*00e8*/ IADD R19, R0, R13;
/*00f0*/ IMAD.U32.U32 R2.CC, R16, R14, c[0x0][0x28];
/*00f8*/ LD.E R22, [R4];
/*0100*/ IMAD.U32.U32.HI.X R3, R16, R14, c[0x0][0x2c];
/*0108*/ IMAD.U32.U32 R6.CC, R16, R14, c[0x0][0x20];
/*0110*/ LD.E R2, [R2];
/*0118*/ IMAD.U32.U32.HI.X R7, R16, R14, c[0x0][0x24];
/*0120*/ MOV32I R3, 0xfffff800;
/*0128*/ IMAD.U32.U32 R10.CC, R21, R14, c[0x0][0x30];
/*0130*/ IMAD.U32.U32.HI.X R11, R21, R14, c[0x0][0x34];
/*0138*/ IMAD.U32.U32 R12.CC, R16, R14, c[0x0][0x30];
/*0140*/ IMAD.U32.U32.HI.X R13, R16, R14, c[0x0][0x34];
/*0148*/ LD.E R10, [R10];
/*0150*/ IMAD.U32.U32 R8.CC, R19, R14, c[0x0][0x28];
/*0158*/ LD.E R16, [R6];
/*0160*/ IMAD.U32.U32.HI.X R9, R19, R14, c[0x0][0x2c];
/*0168*/ LD.E R12, [R12];
/*0170*/ IMAD.U32.U32 R4.CC, R19, R14, c[0x0][0x30];
/*0178*/ LD.E R8, [R8];
/*0180*/ IMAD.U32.U32.HI.X R5, R19, R14, c[0x0][0x34];
/*0188*/ IMAD.U32.U32 R18.CC, R19, R14, c[0x0][0x20];
/*0190*/ LD.E R4, [R4];
/*0198*/ IMAD.U32.U32.HI.X R19, R19, R14, c[0x0][0x24];
/*01a0*/ LD.E R14, [R18];
/*01a8*/ IMAD R5, R15, R17, R20;
/*01b0*/ IMAD R9, R23, R22, R10;
/*01b8*/ IMAD R11, R14, R8, R4;
/*01c0*/ IMAD R13, R16, R2, R12;
/*01c8*/ IMAD R15, R5, R17, R20;
/*01d0*/ IMAD R21, R9, R22, R10;
/*01d8*/ IMAD R14, R11, R8, R4;
/*01e0*/ IMAD R16, R13, R2, R12;
/*01e8*/ IMAD R15, R15, R17, R20;
/*01f0*/ IMAD R21, R21, R22, R10;
/*01f8*/ IMAD R14, R14, R8, R4;
/*0200*/ IMAD R16, R16, R2, R12;
/*0208*/ IMAD R15, R15, R17, R20;
/*0210*/ IMAD R21, R21, R22, R10;
/*0218*/ IMAD R14, R14, R8, R4;
/*0220*/ IMAD R16, R16, R2, R12;
/*0228*/ IMAD R15, R15, R17, R20;
/*0230*/ IMAD R21, R21, R22, R10;
/*0238*/ IMAD R14, R14, R8, R4;
/*0240*/ IMAD R16, R16, R2, R12;
/*0248*/ IMAD R15, R15, R17, R20;
/*0250*/ IMAD R21, R21, R22, R10;
/*0258*/ IMAD R14, R14, R8, R4;
/*0260*/ IMAD R16, R16, R2, R12;
/*0268*/ IMAD R15, R15, R17, R20;
/*0270*/ IMAD R21, R21, R22, R10;
/*0278*/ IMAD R14, R14, R8, R4;
/*0280*/ IMAD R16, R16, R2, R12;
/*0288*/ IMAD R15, R15, R17, R20;
/*0290*/ IMAD R21, R21, R22, R10;
/*0298*/ IMAD R14, R14, R8, R4;
/*02a0*/ IMAD R16, R16, R2, R12;
/*02a8*/ IMAD R15, R15, R17, R20;
/*02b0*/ IMAD R21, R21, R22, R10;
/*02b8*/ IMAD R14, R14, R8, R4;
/*02c0*/ IMAD R16, R16, R2, R12;
/*02c8*/ IMAD R15, R15, R17, R20;
/*02d0*/ IMAD R21, R21, R22, R10;
/*02d8*/ IMAD R14, R14, R8, R4;
/*02e0*/ IMAD R16, R16, R2, R12;
/*02e8*/ IMAD R15, R15, R17, R20;
/*02f0*/ IMAD R21, R21, R22, R10;
/*02f8*/ IMAD R14, R14, R8, R4;
/*0300*/ IMAD R16, R16, R2, R12;
/*0308*/ IMAD R15, R15, R17, R20;
/*0310*/ IMAD R21, R21, R22, R10;
/*0318*/ IMAD R14, R14, R8, R4;
/*0320*/ IMAD R16, R16, R2, R12;
/*0328*/ IMAD R15, R15, R17, R20;
/*0330*/ IMAD R21, R21, R22, R10;
/*0338*/ IMAD R14, R14, R8, R4;
/*0340*/ IMAD R16, R16, R2, R12;
/*0348*/ IMAD R15, R15, R17, R20;
/*0350*/ IMAD R21, R21, R22, R10;
/*0358*/ IMAD R14, R14, R8, R4;
/*0360*/ IMAD R16, R16, R2, R12;
/*0368*/ IMAD R15, R15, R17, R20;
/*0370*/ IMAD R21, R21, R22, R10;
/*0378*/ IMAD R14, R14, R8, R4;
/*0380*/ IMAD R16, R16, R2, R12;
/*0388*/ IMAD R15, R15, R17, R20;
/*0390*/ IMAD R21, R21, R22, R10;
/*0398*/ IMAD R14, R14, R8, R4;
/*03a0*/ IMAD R16, R16, R2, R12;
/*03a8*/ IMAD R15, R15, R17, R20;
/*03b0*/ IMAD R21, R21, R22, R10;
/*03b8*/ IMAD R14, R14, R8, R4;
/*03c0*/ IMAD R16, R16, R2, R12;
/*03c8*/ IMAD R15, R15, R17, R20;
/*03d0*/ IMAD R21, R21, R22, R10;
/*03d8*/ IMAD R14, R14, R8, R4;
/*03e0*/ IMAD R16, R16, R2, R12;
/*03e8*/ IMAD R15, R15, R17, R20;
/*03f0*/ IMAD R21, R21, R22, R10;
/*03f8*/ IMAD R14, R14, R8, R4;
/*0400*/ IMAD R16, R16, R2, R12;
/*0408*/ IMAD R15, R15, R17, R20;
/*0410*/ IMAD R21, R21, R22, R10;
/*0418*/ IMAD R14, R14, R8, R4;
/*0420*/ IMAD R16, R16, R2, R12;
/*0428*/ IMAD R15, R15, R17, R20;
/*0430*/ IMAD R21, R21, R22, R10;
/*0438*/ IMAD R14, R14, R8, R4;
/*0440*/ IMAD R16, R16, R2, R12;
/*0448*/ IMAD R15, R15, R17, R20;
/*0450*/ IMAD R21, R21, R22, R10;
/*0458*/ IMAD R14, R14, R8, R4;
/*0460*/ IMAD R16, R16, R2, R12;
/*0468*/ IMAD R15, R15, R17, R20;
/*0470*/ IMAD R21, R21, R22, R10;
/*0478*/ IMAD R14, R14, R8, R4;
/*0480*/ IMAD R16, R16, R2, R12;
/*0488*/ IMAD R15, R15, R17, R20;
/*0490*/ IMAD R21, R21, R22, R10;
/*0498*/ IMAD R14, R14, R8, R4;
/*04a0*/ IMAD R16, R16, R2, R12;
/*04a8*/ IMAD R15, R15, R17, R20;
/*04b0*/ IMAD R21, R21, R22, R10;
/*04b8*/ IMAD R14, R14, R8, R4;
/*04c0*/ IMAD R16, R16, R2, R12;
/*04c8*/ IMAD R15, R15, R17, R20;
/*04d0*/ IMAD R21, R21, R22, R10;
/*04d8*/ IMAD R14, R14, R8, R4;
/*04e0*/ IMAD R16, R16, R2, R12;
/*04e8*/ IMAD R15, R15, R17, R20;
/*04f0*/ IMAD R21, R21, R22, R10;
/*04f8*/ IMAD R14, R14, R8, R4;
/*0500*/ IMAD R16, R16, R2, R12;
/*0508*/ IMAD R15, R15, R17, R20;
/*0510*/ IMAD R21, R21, R22, R10;
/*0518*/ IMAD R14, R14, R8, R4;
/*0520*/ IMAD R16, R16, R2, R12;
/*0528*/ IMAD R15, R15, R17, R20;
/*0530*/ IMAD R21, R21, R22, R10;
/*0538*/ IMAD R14, R14, R8, R4;
/*0540*/ IMAD R16, R16, R2, R12;
/*0548*/ IADD R3, R3, 0x20;
/*0550*/ IMAD R15, R15, R17, R20;
/*0558*/ IMAD R21, R21, R22, R10;
/*0560*/ IMAD R14, R14, R8, R4;
/*0568*/ IMAD R16, R16, R2, R12;
/*0570*/ ISETP.NE.AND P0, PT, R3, RZ, PT;
/*0578*/ IMAD R15, R15, R17, R20;
/*0580*/ IMAD R23, R21, R22, R10;
/*0588*/ IMAD R14, R14, R8, R4;
/*0590*/ IMAD R16, R16, R2, R12;
/*0598*/ IMAD R15, R15, R17, R20;
/*05a0*/ IMAD R23, R23, R22, R10;
/*05a8*/ IMAD R14, R14, R8, R4;
/*05b0*/ IMAD R16, R16, R2, R12;
/*05b8*/ @P0 BRA 0x1a8;
/*05c0*/ MOV32I R2, 0x40000000;
/*05c8*/ MOV32I R8, 0x4;
/*05d0*/ IMAD.U32.U32.HI R4, R2, c[0x0][0x38], R0;
/*05d8*/ IMAD R2.CC, R0, R8, c[0x0][0x20];
/*05e0*/ IMAD.HI.X R3, R0, R8, c[0x0][0x24];
/*05e8*/ IMAD.U32.U32 R14.CC, R4, R8, c[0x0][0x20];
/*05f0*/ IMAD.U32.U32.HI.X R15, R4, R8, c[0x0][0x24];
/*05f8*/ ST.E [R2], R5;
/*0600*/ ST.E [R14], R9;
/*0608*/ ST.E [R18], R11;
/*0610*/ ST.E [R6], R13;
/*0618*/ EXIT;
对于 CUDA 5.5 和 6.0,ST 操作存储的是 R0、R18、R17 和 R8,这些寄存器由最后几条 IMAD 指令更新;而对于 CUDA 6.5,ST 操作存储的是 R5、R9、R11 和 R13,这些寄存器并没有被最后几条 IMAD 指令更新。
我做错了什么?
我的环境:运行 Windows 7 的 Core i7 笔记本电脑,编译 Release 配置的项目,32 位或 64 位均出现同样的问题。以上结果对应 64 位构建。
命令行:
64
编辑
仅当源文件中同时存在另外两个内核(kernel0 和 kernel1)时才会出现此问题。如果我把它们注释掉,一切正常。正常与异常两种情况的反汇编代码分别见 here 和 here。
答案 0 :(得分:1)
我已确认问题已在CUDA 7 EA中修复。当CUDA 7 RC或CUDA 7生产版本可用时,问题也应该在那里修复。