std::vector比普通数组慢很多吗？

Question

std::vector比普通数组慢很多吗？

246

我一直以为std::vector是“作为一个数组实现”的普遍认识，但今天我进行了测试，发现事实并非如此：

下面是一些测试结果：

UseArray completed in 2.619 seconds
UseVector completed in 9.284 seconds
UseVectorPushBack completed in 14.669 seconds
The whole thing completed in 26.591 seconds

这大约慢了 3 到 4 倍！并不能为“vector 可能比几个纳秒慢”这样的评论辩护。

我使用的代码是：

#include <cstdlib>
#include <vector>

#include <iostream>
#include <string>

#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>

class TestTimer
{
    public:
        TestTimer(const std::string & name) : name(name),
            start(boost::date_time::microsec_clock<boost::posix_time::ptime>::local_time())
        {
        }

        ~TestTimer()
        {
            using namespace std;
            using namespace boost;

            posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
            posix_time::time_duration d = now - start;

            cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
                " seconds" << endl;
        }

    private:
        std::string name;
        boost::posix_time::ptime start;
};

struct Pixel
{
    Pixel()
    {
    }

    Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
    {
    }

    unsigned char r, g, b;
};

void UseVector()
{
    TestTimer t("UseVector");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
        pixels.resize(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}

void UseVectorPushBack()
{
    TestTimer t("UseVectorPushBack");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels;
            pixels.reserve(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels.push_back(Pixel(255, 0, 0));
    }
}

void UseArray()
{
    TestTimer t("UseArray");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);

        for(int i = 0 ; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }

        free(pixels);
    }
}

int main()
{
    TestTimer t1("The whole thing");

    UseArray();
    UseVector();
    UseVectorPushBack();

    return 0;
}

我是做错了什么吗？还是我打破了这个性能神话？

我正在使用Visual Studio 2005的发布模式。

在Visual C++中，#define _SECURE_SCL 0可以将UseVector的运行时间减少一半（从8秒降至4秒）。在我看来，这真的非常巨大。

- kizzx2

26

在调试模式下，某些向量版本会添加额外的指令来检查您是否超出数组末尾访问等情况。为了获得真正的时间统计，您必须在发布模式下构建并打开优化选项。 - Martin York

51

你测量而不是相信网上听到的声明是很好的做法。 - P Shved

53

"vector"作为一个数组实现，这不仅是"常识"，而是事实。你已经发现了vector是一个通用的可调整大小的数组。恭喜！与所有通用工具一样，可能会出现特殊情况下它并不是最优选择的情况。因此，常规智慧是以vector为起点，必要时再考虑其他替代方案。 - Dennis Zickefoose

37

笑，把脏碗扔进水槽和把脏碗扔进水槽并检查它们是否破裂的速度差别有多大？ - Imre L

9

在VC2010中，至少看起来malloc()比resize()更快。从计时中删除内存分配，使用_ITERATOR_DEBUG_LEVEL == 0编译，结果是相同的。 - Andreas Magnusson

显示剩余16条评论

22个回答

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- KeyC0de · Answer 1

我做了一些长期以来想做的广泛测试。也许可以分享一下。

这是我的双启动机器，i7-3770，16GB RAM，x86_64，在Windows 8.1和Ubuntu 16.04上运行。更多信息和结论，请见下文。测试了MSVS 2017和g ++（在Windows和Linux上均测试）。

测试程序

#include <iostream>
#include <chrono>
//#include <algorithm>
#include <array>
#include <locale>
#include <vector>
#include <queue>
#include <deque>

// Note: total size of array must not exceed 0x7fffffff B = 2,147,483,647B
//  which means that largest int array size is 536,870,911
// Also image size cannot be larger than 80,000,000B
constexpr int long g_size = 100000;
int g_A[g_size];


int main()
{
    std::locale loc("");
    std::cout.imbue(loc);
    constexpr int long size = 100000;  // largest array stack size

    // stack allocated c array
    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
    int A[size];
    for (int i = 0; i < size; i++)
        A[i] = i;

    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "c-style stack array duration=" << duration / 1000.0 << "ms\n";
    std::cout << "c-style stack array size=" << sizeof(A) << "B\n\n";

    // global stack c array
    start = std::chrono::steady_clock::now();
    for (int i = 0; i < g_size; i++)
        g_A[i] = i;

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "global c-style stack array duration=" << duration / 1000.0 << "ms\n";
    std::cout << "global c-style stack array size=" << sizeof(g_A) << "B\n\n";

    // raw c array heap array
    start = std::chrono::steady_clock::now();
    int* AA = new int[size];    // bad_alloc() if it goes higher than 1,000,000,000
    for (int i = 0; i < size; i++)
        AA[i] = i;

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "c-style heap array duration=" << duration / 1000.0 << "ms\n";
    std::cout << "c-style heap array size=" << sizeof(AA) << "B\n\n";
    delete[] AA;

    // std::array<>
    start = std::chrono::steady_clock::now();
    std::array<int, size> AAA;
    for (int i = 0; i < size; i++)
        AAA[i] = i;
    //std::sort(AAA.begin(), AAA.end());

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "std::array duration=" << duration / 1000.0 << "ms\n";
    std::cout << "std::array size=" << sizeof(AAA) << "B\n\n";

    // std::vector<>
    start = std::chrono::steady_clock::now();
    std::vector<int> v;
    for (int i = 0; i < size; i++)
        v.push_back(i);
    //std::sort(v.begin(), v.end());

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "std::vector duration=" << duration / 1000.0 << "ms\n";
    std::cout << "std::vector size=" << v.size() * sizeof(v.back()) << "B\n\n";

    // std::deque<>
    start = std::chrono::steady_clock::now();
    std::deque<int> dq;
    for (int i = 0; i < size; i++)
        dq.push_back(i);
    //std::sort(dq.begin(), dq.end());

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "std::deque duration=" << duration / 1000.0 << "ms\n";
    std::cout << "std::deque size=" << dq.size() * sizeof(dq.back()) << "B\n\n";

    // std::queue<>
    start = std::chrono::steady_clock::now();
    std::queue<int> q;
    for (int i = 0; i < size; i++)
        q.push(i);

    duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
    std::cout << "std::queue duration=" << duration / 1000.0 << "ms\n";
    std::cout << "std::queue size=" << q.size() * sizeof(q.front()) << "B\n\n";
}

结果

//////////////////////////////////////////////////////////////////////////////////////////
// with MSVS 2017:
// >> cl /std:c++14 /Wall -O2 array_bench.cpp
//
// c-style stack array duration=0.15ms
// c-style stack array size=400,000B
//
// global c-style stack array duration=0.130ms
// global c-style stack array size=400,000B
//
// c-style heap array duration=0.90ms
// c-style heap array size=4B
//
// std::array duration=0.20ms
// std::array size=400,000B
//
// std::vector duration=0.544ms
// std::vector size=400,000B
//
// std::deque duration=1.375ms
// std::deque size=400,000B
//
// std::queue duration=1.491ms
// std::queue size=400,000B
//
//////////////////////////////////////////////////////////////////////////////////////////
//
// with g++ version:
//      - (tdm64-1) 5.1.0 on Windows
//      - (Ubuntu 5.4.0-6ubuntu1~16.04.10) 5.4.0 20160609 on Ubuntu 16.04
// >> g++ -std=c++14 -Wall -march=native -O2 array_bench.cpp -o array_bench
//
// c-style stack array duration=0ms
// c-style stack array size=400,000B
//
// global c-style stack array duration=0.124ms
// global c-style stack array size=400,000B
//
// c-style heap array duration=0.648ms
// c-style heap array size=8B
//
// std::array duration=1ms
// std::array size=400,000B
//
// std::vector duration=0.402ms
// std::vector size=400,000B
//
// std::deque duration=0.234ms
// std::deque size=400,000B
//
// std::queue duration=0.304ms
// std::queue size=400,000
//
//////////////////////////////////////////////////////////////////////////////////////////

注释

平均运行次数为10次。
我最初也使用了std::sort()进行测试（你可以看到它被注释掉了），但后来将其删除，因为没有显著的相对差异。

我的结论和备注

请注意，全局C风格数组所需的时间几乎与堆C风格数组相同。
在所有测试中，我注意到std::array的时间变化在连续运行之间具有显着的稳定性，而其他数据结构（特别是std::）则相对变化很大。
O3优化没有显示出任何值得注意的时间差异。
在Windows cl（无-O2）和g++（Win / Linux无-O2，无-march = native）上取消优化会显着增加时间。特别是对于std：data structs。总体而言，MSVS比g ++的时间更长，但在Windows上，std::array和c-style数组在没有优化的情况下更快。
g ++生成的代码比Microsoft的编译器更快（显然即使在Windows上运行也更快）。

判决

当然，这是优化构建的代码。既然问题涉及到std::vector，那么它比普通数组（优化/未优化）要慢得多。但是在进行基准测试时，您自然希望生成优化的代码。

对我来说，主角是std::array。

- Michał Szczepaniak · Answer 2

更好的基准测试（我认为...），编译器由于优化可以更改代码，因为分配的向量/数组的结果没有在任何地方使用。

$ g++ test.cpp -o test -O3 -march=native
$ ./test 
UseArray inner completed in 0.652 seconds
UseArray completed in 0.773 seconds
UseVector inner completed in 0.638 seconds
UseVector completed in 0.757 seconds
UseVectorPushBack inner completed in 6.732 seconds
UseVectorPush completed in 6.856 seconds
The whole thing completed in 8.387 seconds

编译器：

gcc version 6.2.0 20161019 (Debian 6.2.0-9)

CPU：

model name  : Intel(R) Core(TM) i7-3630QM CPU @ 2.40GHz

而且代码：

#include <cstdlib>
#include <vector>

#include <iostream>
#include <string>

#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>

class TestTimer
{
    public:
        TestTimer(const std::string & name) : name(name),
            start(boost::date_time::microsec_clock<boost::posix_time::ptime>::local_time())
        {
        }

        ~TestTimer()
        {
            using namespace std;
            using namespace boost;

            posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
            posix_time::time_duration d = now - start;

            cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
                " seconds" << endl;
        }

    private:
        std::string name;
        boost::posix_time::ptime start;
};

struct Pixel
{
    Pixel()
    {
    }

    Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
    {
    }

    unsigned char r, g, b;
};

void UseVector(std::vector<std::vector<Pixel> >& results)
{
    TestTimer t("UseVector inner");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel>& pixels = results.at(i);
        pixels.resize(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}

void UseVectorPushBack(std::vector<std::vector<Pixel> >& results)
{
    TestTimer t("UseVectorPushBack inner");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel>& pixels = results.at(i);
            pixels.reserve(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels.push_back(Pixel(255, 0, 0));
    }
}

void UseArray(Pixel** results)
{
    TestTimer t("UseArray inner");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);

        results[i] = pixels;

        for(int i = 0 ; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }

        // free(pixels);
    }
}

void UseArray()
{
    TestTimer t("UseArray");
    Pixel** array = (Pixel**)malloc(sizeof(Pixel*)* 1000);
    UseArray(array);
    for(int i=0;i<1000;++i)
        free(array[i]);
    free(array);
}

void UseVector()
{
    TestTimer t("UseVector");
    {
        std::vector<std::vector<Pixel> > vector(1000, std::vector<Pixel>());
        UseVector(vector);
    }
}

void UseVectorPushBack()
{
    TestTimer t("UseVectorPush");
    {
        std::vector<std::vector<Pixel> > vector(1000, std::vector<Pixel>());
        UseVectorPushBack(vector);
    }
}


int main()
{
    TestTimer t1("The whole thing");

    UseArray();
    UseVector();
    UseVectorPushBack();

    return 0;
}

- Anycorn · Answer 3

一些分析器数据（像素对齐到32位）：

g++ -msse3 -O3 -ftree-vectorize -g test.cpp -DNDEBUG && ./a.out
UseVector completed in 3.123 seconds
UseArray completed in 1.847 seconds
UseVectorPushBack completed in 9.186 seconds
The whole thing completed in 14.159 seconds

无聊

andrey@nv:~$ opannotate --source libcchem/src/a.out  | grep "Total samples for file" -A3
Overflow stats not available
 * Total samples for file : "/usr/include/c++/4.4/ext/new_allocator.h"
 *
 * 141008 52.5367
 */
--
 * Total samples for file : "/home/andrey/libcchem/src/test.cpp"
 *
 *  61556 22.9345
 */
--
 * Total samples for file : "/usr/include/c++/4.4/bits/stl_vector.h"
 *
 *  41956 15.6320
 */
--
 * Total samples for file : "/usr/include/c++/4.4/bits/stl_uninitialized.h"
 *
 *  20956  7.8078
 */
--
 * Total samples for file : "/usr/include/c++/4.4/bits/stl_construct.h"
 *
 *   2923  1.0891
 */

在allocator中：

               :      // _GLIBCXX_RESOLVE_LIB_DEFECTS
               :      // 402. wrong new expression in [some_] allocator::construct
               :      void
               :      construct(pointer __p, const _Tp& __val)
141008 52.5367 :      { ::new((void *)__p) _Tp(__val); }

向量:

               :void UseVector()
               :{ /* UseVector() total:  60121 22.3999 */
...
               :
               :
 10790  4.0201 :        for (int i = 0; i < dimension * dimension; ++i) {
               :
   495  0.1844 :            pixels[i].r = 255;
               :
 12618  4.7012 :            pixels[i].g = 0;
               :
  2253  0.8394 :            pixels[i].b = 0;
               :
               :        }

数组

               :void UseArray()
               :{ /* UseArray() total:  35191 13.1114 */
               :
...
               :
   136  0.0507 :        for (int i = 0; i < dimension * dimension; ++i) {
               :
  9897  3.6874 :            pixels[i].r = 255;
               :
  3511  1.3081 :            pixels[i].g = 0;
               :
 21647  8.0652 :            pixels[i].b = 0;

大部分的开销在于复制构造函数。例如，

    std::vector < Pixel > pixels;//(dimension * dimension, Pixel());

    pixels.reserve(dimension * dimension);

    for (int i = 0; i < dimension * dimension; ++i) {

        pixels[i].r = 255;

        pixels[i].g = 0;

        pixels[i].b = 0;
    }

它具有与数组相同的性能。

- wheaties · Answer 4

这里是vector中push_back方法的工作原理：

向量在初始化时分配X数量的空间。
如下所述，它检查当前基础数组中是否有该项的空间。
它复制了push_back调用中的项。

调用push_back X个项目后：

向量将kX数量的空间重新分配到第二个数组中。
将第一个数组的条目复制到第二个数组中。
丢弃第一个数组。
现在使用第二个数组作为存储，直到达到kX条目。

重复。如果您没有保留空间，那么速度肯定会变慢。更重要的是，如果复制该项很昂贵，那么像这样的'push_back'将使您疲于应对。

至于vector与数组之间的问题，我必须同意其他人的观点。在发布中运行，打开优化，并添加一些标志，以便Microsoft友好的人不会为您搞砸它。

还有一件事，如果您不需要调整大小，请使用Boost.Array。

- StereoMatching · Answer 5

我的笔记本是联想G770（4GB RAM）。

操作系统为Windows 7 64位（与笔记本一起提供的那个）

编译器是MinGW 4.6.1。

集成开发环境是Code::Blocks。

我测试了第一个帖子的源代码。

结果

O2优化

UseArray在2.841秒内完成

UseVector在2.548秒内完成

UseVectorPushBack在11.95秒内完成

整个过程在17.342秒内完成

系统暂停

O3优化

UseArray在1.452秒内完成

UseVector在2.514秒内完成

UseVectorPushBack在12.967秒内完成

整个过程在16.937秒内完成

看起来，在O3优化下，向量的性能比较差。

如果您将循环更改为

    pixels[i].r = i;
    pixels[i].g = i;
    pixels[i].b = i;

在O2和O3下，数组和向量的速度几乎相同。

- user2189731 · Answer 6

我必须说我不是C ++的专家。但是为了添加一些实验结果：

编译： gcc-6.2.0 / bin / g ++ -O3 -std = c ++ 14 vector.cpp

机器：

Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz

OS:

2.6.32-642.13.1.el6.x86_64

输出：

UseArray completed in 0.167821 seconds
UseVector completed in 0.134402 seconds
UseConstructor completed in 0.134806 seconds
UseFillConstructor completed in 1.00279 seconds
UseVectorPushBack completed in 6.6887 seconds
The whole thing completed in 8.12888 seconds

这里唯一让我感到奇怪的是，“UseFillConstructor”相对于“UseConstructor”的性能。

代码：

void UseConstructor()
{
    TestTimer t("UseConstructor");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels(dimension*dimension);
        for(int i = 0; i < dimension * dimension; ++i)
        {
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;
        }
    }
}


void UseFillConstructor()
{
    TestTimer t("UseFillConstructor");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        std::vector<Pixel> pixels(dimension*dimension, Pixel(255,0,0));
    }
}

因此，提供的额外“价值”会大大降低性能，我认为这是由于多次调用复制构造函数造成的。但是...

编译：

gcc-6.2.0/bin/g++ -std=c++14 -O vector.cpp

输出：

UseArray completed in 1.02464 seconds
UseVector completed in 1.31056 seconds
UseConstructor completed in 1.47413 seconds
UseFillConstructor completed in 1.01555 seconds
UseVectorPushBack completed in 6.9597 seconds
The whole thing completed in 11.7851 seconds

所以在这种情况下，gcc优化非常重要，但当一个值作为默认值提供时，它并不能帮助你太多。实际上，这与我的教学相悖。希望这能帮助新程序员选择向量初始化格式。

- Charles Chow · Answer 7

我想提一下，vector（和smart_ptr）只是在原始数组（和原始指针）的基础上添加了一个薄层。实际上，在连续内存中访问vector的访问时间比数组更快。以下代码显示了初始化和访问vector和数组的结果。

#include <boost/date_time/posix_time/posix_time.hpp>
#include <iostream>
#include <vector>
#define SIZE 20000
int main() {
    srand (time(NULL));
    vector<vector<int>> vector2d;
    vector2d.reserve(SIZE);
    int index(0);
    boost::posix_time::ptime start_total = boost::posix_time::microsec_clock::local_time();
    //  timer start - build + access
    for (int i = 0; i < SIZE; i++) {
        vector2d.push_back(vector<int>(SIZE));
    }
    boost::posix_time::ptime start_access = boost::posix_time::microsec_clock::local_time();
    //  timer start - access
    for (int i = 0; i < SIZE; i++) {
        index = rand()%SIZE;
        for (int j = 0; j < SIZE; j++) {

            vector2d[index][index]++;
        }
    }
    boost::posix_time::ptime end = boost::posix_time::microsec_clock::local_time();
    boost::posix_time::time_duration msdiff = end - start_total;
    cout << "Vector total time: " << msdiff.total_milliseconds() << "milliseconds.\n";
    msdiff = end - start_acess;
    cout << "Vector access time: " << msdiff.total_milliseconds() << "milliseconds.\n"; 


    int index(0);
    int** raw2d = nullptr;
    raw2d = new int*[SIZE];
    start_total = boost::posix_time::microsec_clock::local_time();
    //  timer start - build + access
    for (int i = 0; i < SIZE; i++) {
        raw2d[i] = new int[SIZE];
    }
    start_access = boost::posix_time::microsec_clock::local_time();
    //  timer start - access
    for (int i = 0; i < SIZE; i++) {
        index = rand()%SIZE;
        for (int j = 0; j < SIZE; j++) {

            raw2d[index][index]++;
        }
    }
    end = boost::posix_time::microsec_clock::local_time();
    msdiff = end - start_total;
    cout << "Array total time: " << msdiff.total_milliseconds() << "milliseconds.\n";
    msdiff = end - start_acess;
    cout << "Array access time: " << msdiff.total_milliseconds() << "milliseconds.\n"; 
    for (int i = 0; i < SIZE; i++) {
        delete [] raw2d[i];
    }
    return 0;
}

输出结果为：

    Vector total time: 925milliseconds.
    Vector access time: 4milliseconds.
    Array total time: 30milliseconds.
    Array access time: 21milliseconds.

如果使用得当，速度几乎相同（正如其他人提到的使用reserve()或resize()一样）。

- Zachary Kraus · Answer 8

顺便说一下，使用向量类时出现的减速问题也会出现在标准类型（如int）中。这是一个多线程代码：

#include <iostream>
#include <cstdio>
#include <map>
#include <string>
#include <typeinfo>
#include <vector>
#include <pthread.h>
#include <sstream>
#include <fstream>
using namespace std;

//pthread_mutex_t map_mutex=PTHREAD_MUTEX_INITIALIZER;

long long num=500000000;
int procs=1;

struct iterate
{
    int id;
    int num;
    void * member;
    iterate(int a, int b, void *c) : id(a), num(b), member(c) {}
};

//fill out viterate and piterate
void * viterate(void * input)
{
    printf("am in viterate\n");
    iterate * info=static_cast<iterate *> (input);
    // reproduce member type
    vector<int> test= *static_cast<vector<int>*> (info->member);
    for (int i=info->id; i<test.size(); i+=info->num)
    {
        //printf("am in viterate loop\n");
        test[i];
    }
    pthread_exit(NULL);
}

void * piterate(void * input)
{
    printf("am in piterate\n");
    iterate * info=static_cast<iterate *> (input);;
    int * test=static_cast<int *> (info->member);
    for (int i=info->id; i<num; i+=info->num) {
        //printf("am in piterate loop\n");
        test[i];
    }
    pthread_exit(NULL);
}

int main()
{
    cout<<"producing vector of size "<<num<<endl;
    vector<int> vtest(num);
    cout<<"produced  a vector of size "<<vtest.size()<<endl;
    pthread_t thread[procs];

    iterate** it=new iterate*[procs];
    int ans;
    void *status;

    cout<<"begining to thread through the vector\n";
    for (int i=0; i<procs; i++) {
        it[i]=new iterate(i, procs, (void *) &vtest);
    //  ans=pthread_create(&thread[i],NULL,viterate, (void *) it[i]);
    }
    for (int i=0; i<procs; i++) {
        pthread_join(thread[i], &status);
    }
    cout<<"end of threading through the vector";
    //reuse the iterate structures

    cout<<"producing a pointer with size "<<num<<endl;
    int * pint=new int[num];
    cout<<"produced a pointer with size "<<num<<endl;

    cout<<"begining to thread through the pointer\n";
    for (int i=0; i<procs; i++) {
        it[i]->member=&pint;
        ans=pthread_create(&thread[i], NULL, piterate, (void*) it[i]);
    }
    for (int i=0; i<procs; i++) {
        pthread_join(thread[i], &status);
    }
    cout<<"end of threading through the pointer\n";

    //delete structure array for iterate
    for (int i=0; i<procs; i++) {
        delete it[i];
    }
    delete [] it;

    //delete pointer
    delete [] pint;

    cout<<"end of the program"<<endl;
    return 0;
}

从代码行为上可以看出，向量实例化是代码中最耗时的部分。一旦你解决了这个瓶颈，剩下的代码将非常快速地运行。不管你运行多少个线程都是如此。

顺便提一下，请忽略绝对疯狂的include数量。我一直在使用这段代码来测试项目，因此包含的头文件数量不断增加。

- Íhor Mé · Answer 9

在我的经验中，有时候，vector<int> 比 int[] 慢很多倍。需要记住的一件事是，向量的向量与 int[][] 非常不同。因为元素可能不是连续的内存。这意味着您可以调整主向量中的不同向量的大小，但 CPU 可能无法像在 int[][] 的情况下那样缓存元素。

- Roger Pate · Answer 10

0

通过正确的选项，向量和数组可以生成相同的汇编代码。在这些情况下，它们当然是相同的速度，因为你无论哪种方式都会得到相同的可执行文件。

- Roger Pate

1

在这种情况下，它们似乎不会生成相同的汇编代码。特别是，似乎没有办法使用向量来抑制对构造函数的调用。您可以参考此处的答案解决该问题（虽然需要阅读很长时间，但确实解释了为什么除了您提供的链接中的简单测试用例之外，还存在性能差异的情况）。实际上，似乎有一种方法——编写自定义STL分配器，如建议所示。个人认为，与使用malloc相比，这样做是不必要的更多工作。 - kizzx2

1

@kizzx2：你必须这样费尽周折地使用未构造对象，这是一件好事，因为99%的时间（我可能严重低估了）都会出现错误。我确实阅读了其他答案，并意识到我没有解决你的具体情况（没有必要，其他答案是正确的），但我仍然想为你提供一个例子，展示向量和数组可以完全相同的行为。 - Roger Pate