我正在尝试在AMD GPU中使用GDS,但无法使其正常运行。我的GPU是AMD RX580。
我使用了这个OpenCL内核:
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void localVarExample(__global int *res)
{
int i = get_global_id(0);
__local int x;
if (i == 0) {
x = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
atom_inc(&x);
barrier(CLK_LOCAL_MEM_FENCE);
if (i == 0) {
*res = x;
}
}
构建并用(CLRX )进行拆卸。之后,我将gds标志添加到DS指令中,并修改了M0寄存器。
所以我得到了这个ASM内核:
/* Disassembling 'barrier-Ellesmere.bin' */
.amdcl2
.gpu Iceland
.64bit
.arch_minor 0
.arch_stepping 4
.driver_version 203603
.compile_options "-fno-bin-source -fno-bin-llvmir -fno-bin-amdil -fbin-exe -D__AMD__=1 -D__Ellesmere__=1 -D__Ellesmere=1 -D__IMAGE_SUPPORT__=1 -DFP_FAST_FMA=1 -cl-denorms-are-zero -m64 -Dcl_khr_fp64=1 -Dcl_amd_fp64=1 -Dcl_khr_global_int32_base_atomics=1 -Dcl_khr_global_int32_extended_atomics=1 -Dcl_khr_local_int32_base_atomics=1 -Dcl_khr_local_int32_extended_atomics=1 -Dcl_khr_int64_base_atomics=1 -Dcl_khr_int64_extended_atomics=1 -Dcl_khr_3d_image_writes=1 -Dcl_khr_byte_addressable_store=1 -Dcl_khr_fp16=1 -Dcl_khr_gl_sharing=1 -Dcl_khr_gl_depth_images=1 -Dcl_amd_device_attribute_query=1 -Dcl_amd_vec3=1 -Dcl_amd_printf=1 -Dcl_amd_media_ops=1 -Dcl_amd_media_ops2=1 -Dcl_amd_popcnt=1 -Dcl_khr_d3d10_sharing=1 -Dcl_khr_d3d11_sharing=1 -Dcl_khr_dx9_media_sharing=1 -Dcl_khr_image2d_from_buffer=1 -Dcl_khr_spir=1 -Dcl_khr_subgroups=1 -Dcl_khr_gl_event=1 -Dcl_khr_depth_images=1 -Dcl_khr_mipmap_image=1 -Dcl_khr_mipmap_image_writes=1 -Dcl_amd_liquid_flash=1 -Dcl_amd_planar_yuv=1"
.acl_version "AMD-COMP-LIB-v0.8 (0.0.SC_BUILD_NUMBER)"
.kernel localVarExample
.config
.dims x
.cws 64, 1, 1
.sgprsnum 13
.vgprsnum 3
.localsize 4
.floatmode 0xc0
.gdssize 0x1000
.pgmrsrc1 0x00ac0040
.pgmrsrc2 0x0000008c
.dx10clamp
.ieeemode
.useargs
.priority 0
.arg _.global_offset_0, "size_t", long
.arg _.global_offset_1, "size_t", long
.arg _.global_offset_2, "size_t", long
.arg _.printf_buffer, "size_t", void*, global, , rdonly
.arg _.vqueue_pointer, "size_t", long
.arg _.aqlwrap_pointer, "size_t", long
.arg res, "int*", int*, global,
.text
s_mov_b32 s7, 0x1000
s_mov_b32 m0, s7 # Setup M0 register with GDS size == 0x1000.
s_lshl_b32 s0, s6, 6 # #s0 = s6 * 64 (== group_id * local_size).
s_load_dwordx2 s[2:3], s[4:5], 0x0 # Load global_offset_0 into s[2:3].
s_waitcnt lgkmcnt(0)
s_add_u32 s0, s0, s2 # s0 += s2. Now s0 = group_id * local_size + global_offset_0.
v_add_u32 v0, vcc, s0, v0 # v0 += s0. Now v0 = get_global_id(0).
v_cmp_lg_i32 s[0:1], v0, 0 # |
s_mov_b64 s[2:3], exec # | if (get_global_id(0) == 0)
s_andn2_b64 exec, s[2:3], s[0:1] # |
v_mov_b32 v0, 0x0
s_cbranch_execz .L64_0
ds_write_b32 v0, v0 gds # write 0 to the first uint32 in GDS (only the first thread do this).
.L64_0:
s_mov_b64 exec, s[2:3]
v_mov_b32 v0, 0
v_mov_b32 v1, 1
s_waitcnt lgkmcnt(0)
ds_add_u32 v0, v1 gds # Atomic increment the first uint32 in GDS (v0 is address, v1 is value).
s_waitcnt lgkmcnt(0) # wait for ds_add_u32
s_andn2_b64 exec, s[2:3], s[0:1] # | if (get_global_id(0) == 0) reuse SGPR from previous if.
v_mov_b32 v0, 0
s_cbranch_execz .L140_0 # Finish all threads exept by the first one .
s_load_dwordx2 s[0:1], s[4:5], 0x30 # Load res argument into s[0:1]. s[0:1] == &res[0] now.
ds_read_b32 v0, v0 gds # Read from the first uint32 in GDS into v0.
s_waitcnt lgkmcnt(0) # Wait for ds_add_u32.
v_mov_b32 v1, s0
v_mov_b32 v2, s1
flat_store_dword v[1:2], v0 # Save v0 into res[0]
.L140_0:
s_endpgm
我在主机程序中用0填充res。
预期的结果是res[0] == 64
。带有LDS的OpenCL内核以这种方式工作。但是在GDS版本res[0] == 0
中。