为什么 std::vector<bool> 更快?

5
作为我实现“埃拉托色尼筛法”时,我遇到了使用 std::vector<bool> 的问题:无法访问原始数据。
因此,我决定使用自定义的极简实现,以便能够访问数据指针。
#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H

#include <algorithm>
template <typename B>

class bits_t{

public:

    typedef B block_t;
    static const size_t block_size = sizeof(block_t) * 8;

    block_t* data;
    size_t size;
    size_t blocks;

    class bit_ref{
    public:
        block_t* const block;
        const block_t mask;

        bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}

        inline void operator=(bool v) const noexcept{
            if(v) *block |= mask;
            else  *block &= ~mask;
        }

        inline operator bool() const noexcept{
            return (bool)(*block & mask);
        }
    };



    bits_t() noexcept : data(nullptr){}

    void resize(const size_t n, const bool v) noexcept{
        block_t fill = v ? ~block_t(0) : block_t(0);
        size = n;
        blocks = (n + block_size - 1) / block_size;
        data = new block_t[blocks];
        std::fill(data, data + blocks, fill);
    }

    inline block_t& block_at_index(const size_t i) const noexcept{
        return data[i / block_size];
    }

    inline size_t index_in_block(const size_t i) const noexcept{
        return i % block_size;
    }

    inline bit_ref operator[](const size_t i) noexcept{
        return bit_ref(block_at_index(i), block_t(1) << index_in_block(i));
    }

    ~bits_t(){
        delete[] data;
    }

};

#endif // LIB_BITS_T_H

这段代码几乎与 /usr/include/c++/4.7/bits/stl_bvector.h 中的代码相同,但速度较慢。
我尝试过一种优化方法,
#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H

#include <algorithm>
template <typename B>

class bits_t{

const B mask[64] = {
    0b0000000000000000000000000000000000000000000000000000000000000001,
    0b0000000000000000000000000000000000000000000000000000000000000010,
    0b0000000000000000000000000000000000000000000000000000000000000100,
    0b0000000000000000000000000000000000000000000000000000000000001000,
    0b0000000000000000000000000000000000000000000000000000000000010000,
    0b0000000000000000000000000000000000000000000000000000000000100000,
    0b0000000000000000000000000000000000000000000000000000000001000000,
    0b0000000000000000000000000000000000000000000000000000000010000000,
    0b0000000000000000000000000000000000000000000000000000000100000000,
    0b0000000000000000000000000000000000000000000000000000001000000000,
    0b0000000000000000000000000000000000000000000000000000010000000000,
    0b0000000000000000000000000000000000000000000000000000100000000000,
    0b0000000000000000000000000000000000000000000000000001000000000000,
    0b0000000000000000000000000000000000000000000000000010000000000000,
    0b0000000000000000000000000000000000000000000000000100000000000000,
    0b0000000000000000000000000000000000000000000000001000000000000000,
    0b0000000000000000000000000000000000000000000000010000000000000000,
    0b0000000000000000000000000000000000000000000000100000000000000000,
    0b0000000000000000000000000000000000000000000001000000000000000000,
    0b0000000000000000000000000000000000000000000010000000000000000000,
    0b0000000000000000000000000000000000000000000100000000000000000000,
    0b0000000000000000000000000000000000000000001000000000000000000000,
    0b0000000000000000000000000000000000000000010000000000000000000000,
    0b0000000000000000000000000000000000000000100000000000000000000000,
    0b0000000000000000000000000000000000000001000000000000000000000000,
    0b0000000000000000000000000000000000000010000000000000000000000000,
    0b0000000000000000000000000000000000000100000000000000000000000000,
    0b0000000000000000000000000000000000001000000000000000000000000000,
    0b0000000000000000000000000000000000010000000000000000000000000000,
    0b0000000000000000000000000000000000100000000000000000000000000000,
    0b0000000000000000000000000000000001000000000000000000000000000000,
    0b0000000000000000000000000000000010000000000000000000000000000000,
    0b0000000000000000000000000000000100000000000000000000000000000000,
    0b0000000000000000000000000000001000000000000000000000000000000000,
    0b0000000000000000000000000000010000000000000000000000000000000000,
    0b0000000000000000000000000000100000000000000000000000000000000000,
    0b0000000000000000000000000001000000000000000000000000000000000000,
    0b0000000000000000000000000010000000000000000000000000000000000000,
    0b0000000000000000000000000100000000000000000000000000000000000000,
    0b0000000000000000000000001000000000000000000000000000000000000000,
    0b0000000000000000000000010000000000000000000000000000000000000000,
    0b0000000000000000000000100000000000000000000000000000000000000000,
    0b0000000000000000000001000000000000000000000000000000000000000000,
    0b0000000000000000000010000000000000000000000000000000000000000000,
    0b0000000000000000000100000000000000000000000000000000000000000000,
    0b0000000000000000001000000000000000000000000000000000000000000000,
    0b0000000000000000010000000000000000000000000000000000000000000000,
    0b0000000000000000100000000000000000000000000000000000000000000000,
    0b0000000000000001000000000000000000000000000000000000000000000000,
    0b0000000000000010000000000000000000000000000000000000000000000000,
    0b0000000000000100000000000000000000000000000000000000000000000000,
    0b0000000000001000000000000000000000000000000000000000000000000000,
    0b0000000000010000000000000000000000000000000000000000000000000000,
    0b0000000000100000000000000000000000000000000000000000000000000000,
    0b0000000001000000000000000000000000000000000000000000000000000000,
    0b0000000010000000000000000000000000000000000000000000000000000000,
    0b0000000100000000000000000000000000000000000000000000000000000000,
    0b0000001000000000000000000000000000000000000000000000000000000000,
    0b0000010000000000000000000000000000000000000000000000000000000000,
    0b0000100000000000000000000000000000000000000000000000000000000000,
    0b0001000000000000000000000000000000000000000000000000000000000000,
    0b0010000000000000000000000000000000000000000000000000000000000000,
    0b0100000000000000000000000000000000000000000000000000000000000000,
    0b1000000000000000000000000000000000000000000000000000000000000000
};

public:

    typedef B block_t;
    static const size_t block_size = sizeof(block_t) * 8;

    block_t* data;
    size_t size;
    size_t blocks;

    class bit_ref{
    public:
        block_t* const block;
        const block_t mask;

        bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}

        inline void operator=(bool v) const noexcept{
            if(v) *block |= mask;
            else  *block &= ~mask;
        }

        inline operator bool() const noexcept{
            return (bool)(*block & mask);
        }
    };



    bits_t() noexcept : data(nullptr){}

    void resize(const size_t n, const bool v) noexcept{
        block_t fill = v ? ~block_t(0) : block_t(0);
        size = n;
        blocks = (n + block_size - 1) / block_size;
        data = new block_t[blocks];
        std::fill(data, data + blocks, fill);
    }

    inline block_t& block_at_index(const size_t i) const noexcept{
        return data[i / block_size];
    }

    inline size_t index_in_block(const size_t i) const noexcept{
        return i % block_size;
    }

    inline bit_ref operator[](const size_t i) noexcept{
        return bit_ref(block_at_index(i), mask[index_in_block(i)]);
    }

    ~bits_t(){
        delete[] data;
    }

};

#endif // LIB_BITS_T_H

(使用g++4.7 -O3进行编译)

Eratosthenes筛选算法(33.333.333位)

std::vector<bool> 19.1秒

bits_t<size_t> 19.9秒

bits_t<size_t>(带有查找表) 19.7秒

ctor + resize(33.333.333位) + dtor

std::vector<bool> 120毫秒

bits_t<size_t> 150毫秒

问题:减速来自哪里?


4
代码与 /usr/include/c++/4.7/bits/stl_bvector.h 中的代码几乎相同。 - Adriano Repetti
1
几乎意味着我没有发现任何不同之处。 - Mmmh mmh
2
这么小的差异可能是由于完全不相关的外部因素造成的,例如热循环如何与代码缓存行边界对齐。 - Gene
@Mike Seymour:你认为resize()为什么会泄漏一些东西? - DarkWanderer
@DarkWanderer:因为它将"data"重新分配到一个新分配的内存块中,而不删除旧的内存块。 - Mike Seymour
显示剩余7条评论
2个回答

3
除了其他用户指出的所有问题之外,您的调整大小在达到当前块限制时分配更多内存以添加一个块。std::vector将加倍缓冲区的大小(因此,如果您已经有16个块,现在您有32个块)。换句话说,他们会比你做更少的新事情。
话虽如此,您没有执行必要的删除和复制,这可能对您的版本产生“积极”的影响...(速度方面的“积极”影响,不是您不删除旧数据,也不是将其复制到新缓冲区中)。
此外,std::vector将正确扩大缓冲区,因此复制的数据很可能已经在CPU缓存中。使用您的版本时,该缓存会丢失,因为您只是忽略每次resize()时的旧缓冲区。
此外,当类处理内存缓冲区时,通常要实现复制和赋值运算符,出于某些原因...您还可以考虑使用shared_ptr<>()。然后,删除被隐藏,而类是一个模板,因此非常快(它不会添加任何您在自己的版本中没有的代码。)
=== 更新
还有一件事。您的operator[]实现:
inline bit_ref operator[](const size_t i) noexcept{
    return bit_ref(block_at_index(i), mask[index_in_block(i)]);
}

(附注:由于您已经在类中编写代码,因此内联不是必需的。)
您只提供了一个非const版本,它“很慢”,因为它创建了一个子类。您应该尝试实现一个返回bool值的const版本,并查看是否可以解决您看到的约3%的差异。
bool operator[](const size_t i) const noexcept
{
    return (block_at_index(i) & mask[index_in_block(i)]) != 0;
}

此外,使用mask[]数组也可能会减慢速度。而使用(1LL << (index & 0x3F))则更快(只需2个CPU指令且不需要访问内存)。


它只是被命名为resize,这样我就可以在向量和这个类之间切换...我真的只在整个执行过程中执行一次调整大小。shared_ptr使用多态性进行删除器,因此它并不完全相同。 - Mmmh mmh
对于resize()函数,它的使用方法并不清晰。但是它可能应该被替换为set_size()函数,并且该函数可以检查数据是否已经定义,以防万一抛出一个runtime_error异常。 - Alexis Wilke
我已经测试了你提出的所有建议,但显然带有 -O3 标志的编译器已经自己完成了这个任务。希望我已经找到了罪魁祸首。明确地内联 inline size_t index_in_block(const size_t i) const(即用 i%block_size 替换方法调用)将为 bits_t<unsigned long>std::vector<bool> 提供相同的运行时间,但我不确定为什么会这样。你有什么想法吗?如果你想测试它,我可以分享代码。 - Mmmh mmh
可能是使用inline时编译器会强制内联,而不使用时,即使使用-O3选项,它也可能保持函数分离。这在执行方面并没有太大的影响,因为它肯定都在指令缓存中。虽然两者都是模板,但我认为几乎所有内容都会自动内联。但我可能错了。默认情况下,被认为过于复杂的函数可能不会被内联。 - Alexis Wilke
你应该转储汇编代码,看看两种情况之间的确切差异。使用一些 Bash 指令,你可以转储出两种情况下的汇编代码并进行比较,很可能在短时间内就能得出答案,而不是瞎猜。 - Nir Friedman
显示剩余6条评论

0

显然,将i%block_size包装在一个函数中是罪魁祸首

inline size_t index_in_block ( const size_t i ) const noexcept {
    return i % block_size;
}

inline bit_ref operator[] ( const size_t i ) noexcept {
    return bit_ref( block_at_index( i ), block_t( 1 ) << index_in_block( i ) );
}

因此,将上述代码替换为

inline bit_ref operator[] ( const size_t i ) noexcept {
    return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
}

问题已经解决了。但是我仍然不知道原因。我最好的猜测是我没有正确获取index_in_block的签名,因此优化器无法像手动内联方式那样内联此函数。

这是新代码。

#ifndef LIB_BITS_2_T_H
#define LIB_BITS_2_T_H

#include <algorithm>

template <typename B>

class bits_2_t {

public:

    typedef B block_t;
    static const int block_size = sizeof( block_t ) * __CHAR_BIT__;


private:

    block_t* _data;
    size_t _size;
    size_t _blocks;


public:

    class bit_ref {

    public:

        block_t* const block;
        const block_t mask;


        bit_ref ( block_t& block, const block_t mask) noexcept
        : block( &block ), mask( mask ) {}


        inline bool operator= ( const bool v ) const noexcept {

            if ( v ) *block |= mask;
            else     *block &= ~mask;

            return v;

        }

        inline operator bool() const noexcept {
            return (bool)( *block & mask );
        }


    };


    bits_2_t () noexcept : _data( nullptr ), _size( 0 ), _blocks( 0 ) {}

    bits_2_t ( const size_t n ) noexcept : _data( nullptr ), _size( n ) {

        _blocks = number_of_blocks_needed( n );
        _data = new block_t[_blocks];

        const block_t fill( 0 );
        std::fill( _data, _data + _blocks, fill );

    }

    bits_2_t ( const size_t n, const bool v ) noexcept : _data( nullptr ), _size( n ) {

        _blocks = number_of_blocks_needed( n );
        _data = new block_t[_blocks];

        const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
        std::fill( _data, _data + _blocks, fill );

    }

    void resize ( const size_t n ) noexcept {
        resize( n, false );
    }

    void resize ( const size_t n, const bool v ) noexcept {

        const size_t tmpblocks = number_of_blocks_needed( n );
        const size_t copysize = std::min( _blocks, tmpblocks );

        block_t* tmpdata = new block_t[tmpblocks];
        std::copy( _data, _data + copysize, tmpdata );

        const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
        std::fill( tmpdata + copysize, tmpdata + tmpblocks, fill );

        delete[] _data;

        _data = tmpdata;
        _blocks = tmpblocks;
        _size = n;

    }

    inline size_t number_of_blocks_needed ( const size_t n ) const noexcept {
        return ( n + block_size - 1 ) / block_size;
    }

    inline block_t& block_at_index ( const size_t i ) const noexcept {
        return _data[i / block_size];
    }

    inline bit_ref operator[] ( const size_t i ) noexcept {
        return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
    }

    inline bool operator[] ( const size_t i ) const noexcept {
        return (bool)( block_at_index( i ) & ( block_t( 1 ) << ( i % block_size ) ) );
    }

    inline block_t* data () {
        return _data;
    }

    inline const block_t* data () const {
        return _data;
    }

    inline size_t size () const {
        return _size;
    }

    void clear () noexcept {

        delete[] _data;

        _size = 0;
        _blocks = 0;
        _data = nullptr;

    }

    ~bits_2_t () {
        clear();
    }


};

#endif // LIB_BITS_2_T_H

这是在我的amd64机器上针对素数的新代码的结果,最高达1.000.000.000(3次运行中的最佳实际时间)。

埃拉托斯特尼筛法,每个数字使用1个内存单元(不跳过2的倍数)。

bits_t<uint8_t>

实际时间0m23.614秒,用户时间0m23.493秒,系统时间0m0.092秒

bits_t<uint16_t>

实际时间0m24.399秒,用户时间0m24.294秒,系统时间0m0.084秒

bits_t<uint32_t>

实际时间0m23.501秒,用户时间0m23.372秒,系统时间0m0.108秒 <-- 最佳

bits_t<uint64_t>

实际时间0m24.393秒,用户时间0m24.304秒,系统时间0m0.068秒

std::vector<bool>

真实时间 0m24.362秒 用户时间 0m24.276秒 系统时间 0m0.056秒

std::vector<uint8_t>

真实时间 0m38.303秒 用户时间 0m37.570秒 系统时间 0m0.683秒

这是筛法的代码(其中(...)应替换为您选择的位数组)。

#include <iostream>

typedef (...) array_t;

int main ( int argc, char const *argv[] ) {

    if ( argc != 2 ) {
        std::cout << "#0 missing" << std::endl;
        return 1;
    }

    const size_t count = std::stoull( argv[1] );
    array_t prime( count, true );
    prime[0] = prime[1] = false;


    for ( size_t k = 2 ; k * k < count ; ++k ) {

        if ( prime[k] ) {

            for ( size_t i = k * k ; i < count ; i += k ) {
                prime[i] = false;
            }

        }

    }

    return 0;
}

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接