CUDA:无符号字符的原子操作

5
我是一个CUDA初学者。我有一个无符号字符的像素缓冲区存在全局内存中,可以被任何线程更新。为了避免像素值出现奇怪的情况,因此当一个线程尝试更新一个像素时,我想执行一个atomicExch操作。但编程指南说这个函数只能在32位或64位字上工作,而我只想原子地交换一个8位字节。有没有办法做到这一点?
谢谢。

更新是什么?如果你想将一个数字翻转为0或1,你可以使用atomicAnd/Or。 - Anycorn
我正在根据“驻留”在该像素中的对象数量增加像素值。因此,如果我的增量为每个对象50,则一个对象将导致具有RGBA(50,50,50,50)的像素,而两个对象将具有RGBA(100,100,100,100)等,最高达(255,255,255,255)。这使我能够根据“内部”该像素的对象数量来改变像素的强度。 - Andrew
顺便说一下,我找到了一个技巧。由于uchar4占用与int相同的空间(虽然不能保证,但在我的架构上可以工作),我只需取uchar4的地址,将其强制转换为(int *),然后使用整数版本的atomicExch即可。不过,我仍然很想知道是否可以仅对单个字节进行原子操作... - Andrew
3个回答

3
我最近遇到了这个问题。理论上,原子操作/乐观重试应该比锁/互斥更快,因此在其他数据类型上使用原子操作的“hack”解决方案似乎比使用关键部分更好。
以下是基于线程的实现,用于如何实现char的atomicMinshort的atomicAdd
我已经测试过所有这些,我的测试似乎表明它们目前工作良好。
第一版char的atomicAdd:
__device__ static inline char atomicAdd(char* address, char val) {
    // offset, in bytes, of the char* address within the 32-bit address of the space that overlaps it
    size_t long_address_modulo = (size_t) address & 3;
    // the 32-bit address that overlaps the same memory
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    // A 0x3210 selector in __byte_perm will simply select all four bytes in the first argument in the same order.
    // The "4" signifies the position where the first byte of the second argument will end up in the output.
    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
    // for selecting bytes within a 32-bit chunk that correspond to the char* address (relative to base_address)
    unsigned int selector = selectors[long_address_modulo];
    unsigned int long_old, long_assumed, long_val, replacement;

    long_old = *base_address;

    do {
        long_assumed = long_old;
        // replace bits in long_old that pertain to the char address with those from val
        long_val = __byte_perm(long_old, 0, long_address_modulo) + val;
        replacement = __byte_perm(long_old, long_val, selector);
        long_old = atomicCAS(base_address, long_assumed, replacement);
    } while (long_old != long_assumed);
    return __byte_perm(long_old, 0, long_address_modulo);
}

char类型的atomicCAS

__device__ static inline char atomicCAS(char* address, char expected, char desired) {
    size_t long_address_modulo = (size_t) address & 3;
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};

    unsigned int sel = selectors[long_address_modulo];
    unsigned int long_old, long_assumed, long_val, replacement;
    char old;

    long_val = (unsigned int) desired;
    long_old = *base_address;
    do {
        long_assumed = long_old;
        replacement = __byte_perm(long_old, long_val, sel);
        long_old = atomicCAS(base_address, long_assumed, replacement);
        old = (char) ((long_old >> (long_address_modulo * 8)) & 0x000000ff);
    } while (expected == old && long_assumed != long_old);

    return old;
}

针对char类型的atomicAdd的第二个版本(使用位移代替__byte_perm,因此必须处理溢出)

__device__ static inline char atomicAdd2(char* address, char val) {
    size_t long_address_modulo = (size_t) address & 3;
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    unsigned int long_val = (unsigned int) val << (8 * long_address_modulo);
    unsigned int long_old = atomicAdd(base_address, long_val);

    if (long_address_modulo == 3) {
        // the first 8 bits of long_val represent the char value,
        // hence the first 8 bits of long_old represent its previous value.
        return (char) (long_old >> 24);
    } else {
        // bits that represent the char value within long_val
        unsigned int mask = 0x000000ff << (8 * long_address_modulo);
        unsigned int masked_old = long_old & mask;
        // isolate the bits that represent the char value within long_old, add the long_val to that,
        // then re-isolate by excluding bits that represent the char value
        unsigned int overflow = (masked_old + long_val) & ~mask;
        if (overflow) {
            atomicSub(base_address, overflow);
        }
        return (char) (masked_old >> 8 * long_address_modulo);
    }
}

关于atomicMin,请查看该帖子


1

0

其他答案atomicCAS()的实现中存在一个错误。这个版本对我来说是有效的:

__device__
static inline
uint8_t
atomicCAS( uint8_t * const address,
           uint8_t   const compare,
           uint8_t   const value )
{
    // Determine where in a byte-aligned 32-bit range our address of 8 bits occurs.
    uint8_t    const     longAddressModulo = reinterpret_cast< size_t >( address ) & 0x3;
    // Determine the base address of the byte-aligned 32-bit range that contains our address of 8 bits.
    uint32_t * const     baseAddress       = reinterpret_cast< uint32_t * >( address - longAddressModulo );
    uint32_t   constexpr byteSelection[]   = { 0x3214, 0x3240, 0x3410, 0x4210 }; // The byte position we work on is '4'.
    uint32_t   const     byteSelector      = byteSelection[ longAddressModulo ];
    uint32_t   const     longCompare       = compare;
    uint32_t   const     longValue         = value;
    uint32_t             longOldValue      = * baseAddress;
    uint32_t             longAssumed;
    uint8_t              oldValue;

    do
    {
        // Select bytes from the old value and new value to construct a 32-bit value to use.
        uint32_t const replacement = __byte_perm( longOldValue, longValue,   byteSelector );
        uint32_t const comparison  = __byte_perm( longOldValue, longCompare, byteSelector );

        longAssumed  = longOldValue;
        // Use 32-bit atomicCAS() to try and set the 8-bits we care about.
        longOldValue = ::atomicCAS( baseAddress, comparison, replacement );
        // Grab the 8-bit portion we care about from the old value at address.
        oldValue     = ( longOldValue >> ( 8 * longAddressModulo )) & 0xFF;
    }
    while ( compare == oldValue and longAssumed != longOldValue ); // Repeat until other three 8-bit values stabilize.

    return oldValue;
}

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接