我对 __shfl_down 和 __shfl_down_sync 感到困惑:它们给出了不同的结果。
/*
 * Shuffle-down accumulation using the legacy, mask-less __shfl_down intrinsic.
 *
 * Each thread starts from 2.0 and, for offsets 16, 8, 4, 2 and 1, adds the
 * value __shfl_down hands back for that offset (segment width 32). Every
 * thread then prints its threadIdx.x, its accumulated value and
 * blockDim.x * gridDim.x.
 *
 * The parameters a, b, c and N are accepted but never read.
 */
__global__ void shufledown1(double* a, double *b,double *c, int N)
{
    double acc = 2.0;
    __syncthreads();

    int delta = 32 / 2;
    while (delta > 0) {
        acc = acc + __shfl_down(acc, delta, 32);
        delta /= 2;
    }

    const unsigned int launched = blockDim.x * gridDim.x;
    printf("%d %f %d \n", threadIdx.x, acc, launched);
}
/*
 * Same loop as shufledown1, but calling __shfl_down_sync with the identical
 * three arguments (temp, offset, 32). Every thread prints its threadIdx.x,
 * its accumulated value and blockDim.x * gridDim.x.
 *
 * The parameters a, b, c and N are accepted but never read.
 *
 * NOTE: the call is intentionally kept as originally posted, because it is the
 * behaviour being asked about. Only the statement terminator that was missing
 * after the call has been restored so the kernel compiles.
 */
__global__ void shufledown2(double* a, double *b,double *c, int N)
{
    double temp = 2.0;
    __syncthreads();
    for (int offset = 32/2; offset > 0; offset /= 2){
        // NOTE(review): __shfl_down_sync expects the participation mask as its
        // FIRST argument. Written this way, temp lands in the mask slot, offset
        // in the value slot and 32 in the delta slot, so this is not equivalent
        // to __shfl_down(temp, offset, 32). The equivalent call would be
        // __shfl_down_sync(0xFFFFFFFF, temp, offset, 32).
        temp+=__shfl_down_sync(temp, offset,32);
    }
    printf("%d %f %d \n",threadIdx.x ,temp,blockDim.x * gridDim.x);
}
第一个给出了:
0 64.000000 64
'''''
''''
''''
63 64.000000 64
第二个给出了:
0 33.000000 64
'''''
''''
''''
63 33.000000 64
内核以 1 个线程块、64 个线程的配置运行。此致敬礼
答案 0 :(得分:8)
除了名称不同之外,warp shuffle 函数的 _sync 版本的原型也不同(参见 CUDA 官方文档中关于 warp shuffle 函数的说明):它的第一个参数是掩码(mask)参数。
您似乎试图以相同的方式调用这两个函数:
temp+=__shfl_down(temp, offset,32);
temp+=__shfl_down_sync(temp, offset,32);
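但这是不正确的。在第二个调用中,temp 被当作掩码,offset 被当作要交换的值,32 被当作偏移量(delta)。偏移量 32 超出了 warp 的范围,因此每个线程取回的是自己传入的值,也就是 offset,于是 temp = 2 + 16 + 8 + 4 + 2 + 1 = 33,这与您看到的输出一致。要以类似的方式使用 _sync 版本,您应该这样做: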
temp+=__shfl_down_sync(0xFFFFFFFF, temp, offset,32);
做出上述修改后,您的代码在我这里可以正确运行:
#include <stdio.h>
/*
 * Reference kernel built on the legacy __shfl_down intrinsic (no mask
 * argument). The toolkit reports this intrinsic as deprecated in favor of
 * __shfl_down_sync; it is kept here only so its output can be compared with
 * the _sync version.
 *
 * Each thread starts from 2.0, adds the shuffled-down value for offsets
 * 16, 8, 4, 2 and 1 (segment width 32), then prints its threadIdx.x, the
 * accumulated value and blockDim.x * gridDim.x.
 *
 * The parameters a, b, c and N are accepted but never read.
 */
__global__ void shufledown1(double* a, double *b,double *c, int N)
{
    double sum = 2.0;
    __syncthreads();

    for (int step = 16; step > 0; step /= 2) {
        const double shifted = __shfl_down(sum, step, 32);
        sum += shifted;
    }

    const unsigned int threadCount = blockDim.x * gridDim.x;
    printf("%d %f %d \n", threadIdx.x, sum, threadCount);
}
/*
 * Same accumulation as shufledown1, written with __shfl_down_sync.
 *
 * The _sync intrinsic takes the participation mask as its first argument;
 * 0xFFFFFFFF is passed here, followed by the value, the offset and the
 * segment width 32.
 *
 * Each thread starts from 2.0, adds the shuffled-down value for offsets
 * 16, 8, 4, 2 and 1, then prints its threadIdx.x, the accumulated value and
 * blockDim.x * gridDim.x.
 *
 * The parameters a, b, c and N are accepted but never read.
 */
__global__ void shufledown2(double* a, double *b,double *c, int N)
{
    double sum = 2.0;
    __syncthreads();

    int step = 32 / 2;
    while (step > 0) {
        sum = sum + __shfl_down_sync(0xFFFFFFFF, sum, step, 32);
        step /= 2;
    }

    const unsigned int threadCount = blockDim.x * gridDim.x;
    printf("%d %f %d \n", threadIdx.x, sum, threadCount);
}
/*
 * Checks the launch that was just issued and waits for it to finish.
 * Returns 0 on success; on failure prints the CUDA error to stderr and
 * returns 1.
 */
static int finishLaunch(const char *kernelName)
{
    // Launch-configuration errors are only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    // Errors raised while the kernel runs surface at the next synchronizing
    // call; the synchronize also flushes the device-side printf output.
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", kernelName, cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

/*
 * Runs shufledown1 and then shufledown2, each with 1 block of 64 threads,
 * waiting for each kernel to complete before continuing.
 *
 * Returns 0 when both kernels ran successfully and 1 as soon as a launch or
 * execution error is reported.
 */
int main(){
    // Placeholder arguments: the pointers are NULL and N is 0.
    double *a = NULL, *b = NULL, *c = NULL;

    shufledown1<<<1,64>>>(a, b, c, 0);
    if (finishLaunch("shufledown1") != 0)
        return 1;

    shufledown2<<<1,64>>>(a, b, c, 0);
    if (finishLaunch("shufledown2") != 0)
        return 1;

    return 0;
}
$ nvcc -arch=sm_60 -o t1358 t1358.cu
t1358.cu(9): warning: function "__shfl_down(double, unsigned int, int)"
/usr/local/cuda/bin/..//include/sm_30_intrinsics.hpp(453): here was declared deprecated ("__shfl_down() is deprecated in favor of __shfl_down_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning).")
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 49; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 52; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 63; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 66; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 77; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 80; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 91; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 94; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 105; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
ptxas /tmp/tmpxft_000045b6_00000000-5_t1358.ptx, line 108; warning : Instruction 'shfl' without '.sync' is deprecated since PTX ISA version 6.0 and will be discontinued in a future PTX ISA version
$ ./t1358
0 64.000000 64
1 64.000000 64
2 64.000000 64
3 64.000000 64
4 64.000000 64
5 64.000000 64
6 64.000000 64
7 64.000000 64
8 64.000000 64
9 64.000000 64
10 64.000000 64
11 64.000000 64
12 64.000000 64
13 64.000000 64
14 64.000000 64
15 64.000000 64
16 64.000000 64
17 64.000000 64
18 64.000000 64
19 64.000000 64
20 64.000000 64
21 64.000000 64
22 64.000000 64
23 64.000000 64
24 64.000000 64
25 64.000000 64
26 64.000000 64
27 64.000000 64
28 64.000000 64
29 64.000000 64
30 64.000000 64
31 64.000000 64
32 64.000000 64
33 64.000000 64
34 64.000000 64
35 64.000000 64
36 64.000000 64
37 64.000000 64
38 64.000000 64
39 64.000000 64
40 64.000000 64
41 64.000000 64
42 64.000000 64
43 64.000000 64
44 64.000000 64
45 64.000000 64
46 64.000000 64
47 64.000000 64
48 64.000000 64
49 64.000000 64
50 64.000000 64
51 64.000000 64
52 64.000000 64
53 64.000000 64
54 64.000000 64
55 64.000000 64
56 64.000000 64
57 64.000000 64
58 64.000000 64
59 64.000000 64
60 64.000000 64
61 64.000000 64
62 64.000000 64
63 64.000000 64
0 64.000000 64
1 64.000000 64
2 64.000000 64
3 64.000000 64
4 64.000000 64
5 64.000000 64
6 64.000000 64
7 64.000000 64
8 64.000000 64
9 64.000000 64
10 64.000000 64
11 64.000000 64
12 64.000000 64
13 64.000000 64
14 64.000000 64
15 64.000000 64
16 64.000000 64
17 64.000000 64
18 64.000000 64
19 64.000000 64
20 64.000000 64
21 64.000000 64
22 64.000000 64
23 64.000000 64
24 64.000000 64
25 64.000000 64
26 64.000000 64
27 64.000000 64
28 64.000000 64
29 64.000000 64
30 64.000000 64
31 64.000000 64
32 64.000000 64
33 64.000000 64
34 64.000000 64
35 64.000000 64
36 64.000000 64
37 64.000000 64
38 64.000000 64
39 64.000000 64
40 64.000000 64
41 64.000000 64
42 64.000000 64
43 64.000000 64
44 64.000000 64
45 64.000000 64
46 64.000000 64
47 64.000000 64
48 64.000000 64
49 64.000000 64
50 64.000000 64
51 64.000000 64
52 64.000000 64
53 64.000000 64
54 64.000000 64
55 64.000000 64
56 64.000000 64
57 64.000000 64
58 64.000000 64
59 64.000000 64
60 64.000000 64
61 64.000000 64
62 64.000000 64
63 64.000000 64
$
对于新编写的代码或今后维护的代码,您应该只使用 _sync 版本。
有关 mask 参数用法的更多示例,请参阅这篇博客文章。