我是CUDA的初学者。我在全局内存中有一个unsigned char类型的像素缓冲区,任何线程都可能更新它。因此,为了避免像素值出现异常,我想在线程更新某个像素时执行atomicExch。但编程指南说这个函数只适用于32位或64位字,而我只想原子地交换一个8位字节。有没有办法做到这一点?
感谢。
答案 0 :(得分:1)
您可以使用互斥变量实现临界区,例如:
get_the_lock
exch_data
release
答案 1 :(得分:1)
我最近遇到了这个问题。从理论上讲,原子操作/乐观重试应该比锁/互斥量更快,因此在我看来,借助原子操作为其他数据类型实现的“hack”方案比使用临界区要好。
以下是基于“how to implement atomicMin for char”和“atomicAdd for short”这两个帖子的一些实现。
我已经测试了所有这些,而且测试似乎表明它们到目前为止可以正常工作。
atomicAdd的版本1用于字符
/**
 * Atomically adds `val` to the single byte at `address` and returns the byte's
 * previous value.
 *
 * The add is emulated with the 32-bit atomicCAS() on the 4-byte-aligned word
 * that contains `address`: the target byte is recomputed from the last observed
 * word and the whole word is swapped in, retrying until no other thread has
 * modified the word in between.
 */
__device__ static inline char atomicAdd(char* address, char val) {
    // Position (0-3) of the target byte inside its enclosing 4-byte-aligned word.
    const size_t byte_index = (size_t) address & 3;
    // Address of that enclosing 32-bit word.
    auto* word_ptr = (unsigned int*) ((char*) address - byte_index);
    // __byte_perm selectors. 0x3210 would reproduce the first argument unchanged;
    // the "4" marks the slot that receives the first byte of the second argument.
    unsigned int merge_table[] = {0x3214, 0x3240, 0x3410, 0x4210};
    const unsigned int merge_selector = merge_table[byte_index];

    unsigned int observed = *word_ptr;
    for (;;) {
        const unsigned int expected_word = observed;
        // Bring the target byte down to the low byte and add `val`; only the low
        // byte of the sum is used by the merge below.
        const unsigned int summed = __byte_perm(expected_word, 0, byte_index) + val;
        // Splice the updated byte back into its slot, leaving the other bytes as observed.
        const unsigned int updated_word = __byte_perm(expected_word, summed, merge_selector);
        observed = atomicCAS(word_ptr, expected_word, updated_word);
        if (observed == expected_word) {
            break;  // the word was unchanged since we read it, so the swap took effect
        }
    }
    // The low byte of the permuted word is the target byte's value before the add.
    return __byte_perm(observed, 0, byte_index);
}
atomicCAS for char
/**
 * Byte-wide compare-and-swap: if the byte at `address` equals `expected`, it is
 * atomically replaced with `desired`. Returns the value the byte held before
 * the operation (equal to `expected` exactly when the swap happened).
 *
 * Emulated with the 32-bit atomicCAS() on the 4-byte-aligned word containing
 * `address`. The comparison word carries `expected` in the target byte, so the
 * word-level CAS can only succeed when that byte really matches; the previous
 * version compared against the raw snapshot and therefore stored `desired`
 * regardless of the byte's current value.
 */
__device__ static inline char atomicCAS(char* address, char expected, char desired) {
    // Position (0-3) of the target byte inside its enclosing 4-byte-aligned word.
    const size_t long_address_modulo = (size_t) address & 3;
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    // __byte_perm selectors: the "4" marks the slot that receives byte 0 of the
    // second argument; the remaining slots keep the bytes of the first argument.
    const unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
    const unsigned int sel = selectors[long_address_modulo];
    const unsigned int long_expected = (unsigned char) expected;
    const unsigned int long_desired = (unsigned char) desired;

    unsigned int long_old = *base_address;
    for (;;) {
        // Last observed word with the target byte forced to `expected` / `desired`.
        const unsigned int comparison = __byte_perm(long_old, long_expected, sel);
        const unsigned int replacement = __byte_perm(long_old, long_desired, sel);
        long_old = atomicCAS(base_address, comparison, replacement);
        const char old = (char) ((long_old >> (long_address_modulo * 8)) & 0x000000ff);
        // Done when the word-level swap succeeded, or when the target byte itself
        // does not match. Otherwise only unrelated bytes of the word changed
        // underneath us: retry with the fresh snapshot.
        if (long_old == comparison || old != expected) {
            return old;
        }
    }
}
用于char的atomicAdd版本2(使用位移而不是__byte_perm,并且必须处理溢出)
/**
 * Atomically adds `val` to the single byte at `address` (wrapping modulo 256)
 * and returns the byte's previous value. Uses shifts and masks rather than
 * __byte_perm.
 *
 * Implemented as a compare-and-swap loop on the 4-byte-aligned word containing
 * `address`. The previous version issued a 32-bit atomicAdd and then undid any
 * carry with a separate atomicSub; between those two operations the carry was
 * visible in the neighbouring bytes, so a concurrent update of a neighbouring
 * byte could read a wrong old value and apply a wrong correction, corrupting
 * the word. Here the byte is wrapped before the word is written, so no carry
 * ever reaches memory.
 */
__device__ static inline char atomicAdd2(char* address, char val) {
    // Position (0-3) of the target byte inside its enclosing 4-byte-aligned word.
    const size_t long_address_modulo = (size_t) address & 3;
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    const unsigned int shift = 8 * (unsigned int) long_address_modulo;
    // Bits of the 32-bit word that belong to the target byte.
    const unsigned int mask = 0x000000ffu << shift;

    unsigned int long_old = *base_address;
    unsigned int long_assumed;
    do {
        long_assumed = long_old;
        // New byte value, truncated to 8 bits so nothing spills into the next byte.
        const unsigned int sum = ((long_assumed >> shift) + (unsigned char) val) & 0x000000ffu;
        const unsigned int replacement = (long_assumed & ~mask) | (sum << shift);
        long_old = atomicCAS(base_address, long_assumed, replacement);
    } while (long_old != long_assumed);  // another thread changed the word: retry

    return (char) ((long_old & mask) >> shift);
}
对于atomicMin,请检查this thread。
答案 2 :(得分:0)
另一个答案(other answer)中 atomicCAS() 的实现有一个错误。这个版本对我有用:
/**
 * Byte-wide compare-and-swap: if the byte at `address` equals `compare`, it is
 * atomically replaced with `value`. Returns the value the byte held before the
 * operation (equal to `compare` exactly when the swap happened).
 *
 * Emulated with the 32-bit ::atomicCAS() on the 4-byte-aligned word containing
 * `address`. Success is detected by comparing the returned word with the
 * comparison word actually passed to ::atomicCAS(). The previous exit test
 * compared against the earlier snapshot instead, so a swap that succeeded after
 * the target byte had changed to `compare` was retried and the wrong old value
 * was reported.
 */
__device__
static inline
uint8_t
atomicCAS( uint8_t * const address,
           uint8_t const compare,
           uint8_t const value )
{
    // Position (0-3) of our byte inside its enclosing 4-byte-aligned 32-bit word.
    uint8_t const longAddressModulo = reinterpret_cast< size_t >( address ) & 0x3;
    // Base address of the 4-byte-aligned 32-bit word that contains our byte.
    uint32_t * const baseAddress = reinterpret_cast< uint32_t * >( address - longAddressModulo );
    uint32_t constexpr byteSelection[] = { 0x3214, 0x3240, 0x3410, 0x4210 }; // The byte position we work on is '4'.
    uint32_t const byteSelector = byteSelection[ longAddressModulo ];
    uint32_t const longCompare = compare;
    uint32_t const longValue = value;
    uint32_t longOldValue = * baseAddress;
    for ( ;; )
    {
        // Last observed word with our byte forced to `value` / `compare`.
        uint32_t const replacement = __byte_perm( longOldValue, longValue, byteSelector );
        uint32_t const comparison = __byte_perm( longOldValue, longCompare, byteSelector );
        // Use 32-bit atomicCAS() to try and set the 8 bits we care about.
        longOldValue = ::atomicCAS( baseAddress, comparison, replacement );
        // Grab the 8-bit portion we care about from the old value at address.
        uint8_t const oldValue = ( longOldValue >> ( 8 * longAddressModulo )) & 0xFF;
        // Finished when the word-level swap succeeded, or when our byte does not
        // match `compare`. Otherwise only the other three bytes changed: retry
        // with the fresh snapshot.
        if ( longOldValue == comparison or oldValue != compare )
        {
            return oldValue;
        }
    }
}