比较blitz++、armadillo和boost::MultiArray

Question

比较blitz++、armadillo和boost::MultiArray

c++ multidimensional-array armadillo boost-multi-array blitz++

36

我使用了以下代码（从一个旧帖子借用），对blitz++、armadillo和boost::MultiArray进行了比较：

#include <iostream>
using namespace std;
#include <windows.h>
#define _SCL_SECURE_NO_WARNINGS
#define BOOST_DISABLE_ASSERTS 
#include <boost/multi_array.hpp>
#include <blitz/array.h>
#include <armadillo>

// Benchmark driver: for boost::multi_array, blitz::Array, arma::mat and a raw
// heap array, times (a) filling a 1000x1000 matrix of doubles element by
// element ("Loop") and (b) the in-place update m += m * 0.5 ("computation"),
// each repeated ITERATIONS times. One result line is printed per measurement.
// argc/argv are not used.
int main(int argc, char* argv[])
{
    const int X_SIZE = 1000;
    const int Y_SIZE = 1000;
    const int ITERATIONS = 100;
    // Tick counts sampled with ::GetTickCount() before and after each timed
    // region; the difference is divided by 1000.0 when printed as seconds.
    // NOTE(review): GetTickCount() is documented as a coarse millisecond
    // counter, so results of a few tens of ms are close to its resolution —
    // confirm before drawing conclusions from small differences.
    unsigned int startTime = 0;
    unsigned int endTime = 0;

    // Create the boost array


    //------------------Measure boost Loop------------------------------------------
    // NOTE(review): none of the arrays below is read after its timed loop, so an
    // optimizing compiler may legally drop or merge the stores — confirm the
    // generated code before trusting the "Loop" timings.
    {
        typedef boost::multi_array<double, 2> ImageArrayType;
        ImageArrayType boostMatrix(boost::extents[X_SIZE][Y_SIZE]);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // x is the outer index, y the inner (fastest-varying) one.
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    boostMatrix[x][y] = 1.0001;
                }
            }
        }
        endTime = ::GetTickCount();
        printf("[Boost Loop] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }
    //------------------Measure blitz Loop-------------------------------------------
    {
        blitz::Array<double, 2> blitzArray( X_SIZE, Y_SIZE );
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Same traversal order as the boost loop: x outer, y inner.
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    blitzArray(x,y) = 1.0001;
                }
            }
        }
        endTime = ::GetTickCount();
        printf("[Blitz Loop] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure armadillo loop----------------------------------------
    {
        arma::mat matArray( X_SIZE, Y_SIZE );
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Loop nesting is swapped here: y (second index) is the outer loop.
            // NOTE(review): presumably chosen to follow Armadillo's column-major
            // storage — confirm against the Armadillo documentation.
            for (int y = 0; y < Y_SIZE; ++y)
            {
                for (int x = 0; x < X_SIZE; ++x)
                {
                    matArray(x,y) = 1.0001;
                }
            }
        }
        endTime = ::GetTickCount();
        printf("[arma  Loop]  Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure native loop----------------------------------------
    // Create the native array
    {
        double *nativeMatrix = new double [X_SIZE * Y_SIZE];
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Single flat pass over all X_SIZE * Y_SIZE elements (no 2-D indexing).
            for (int y = 0; y < Y_SIZE*X_SIZE; ++y)
            {
                nativeMatrix[y] = 1.0001;
            }
        }
        endTime = ::GetTickCount();
        printf("[Native Loop]Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
        delete[] nativeMatrix;
    }

    //------------------Measure boost computation-----------------------------------
    {
        typedef boost::multi_array<double, 2> ImageArrayType;
        ImageArrayType boostMatrix(boost::extents[X_SIZE][Y_SIZE]);
        // Untimed initialization: every element starts at 1.0001.
        for (int x = 0; x < X_SIZE; ++x)
        {
            for (int y = 0; y < Y_SIZE; ++y)
            {
                boostMatrix[x][y] = 1.0001;
            }
        }
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Timed kernel: element-wise m[x][y] += m[x][y] * 0.5.
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    boostMatrix[x][y] += boostMatrix[x][y] * 0.5;
                }
            }
        }
        endTime = ::GetTickCount();
        printf("[Boost computation] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure blitz computation-----------------------------------
    {
        blitz::Array<double, 2> blitzArray( X_SIZE, Y_SIZE );
        blitzArray = 1.0001;  // untimed initialization via whole-array assignment
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Whole-array expression instead of explicit element loops.
            blitzArray += blitzArray*0.5;
        }
        endTime = ::GetTickCount();
        printf("[Blitz computation] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure armadillo computation-------------------------------
    {
        arma::mat matArray( X_SIZE, Y_SIZE );
        matArray.fill(1.0001);  // untimed initialization
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            //matArray.fill(1.0001);
            // Whole-matrix expression instead of explicit element loops.
            matArray += matArray*0.5;
        }
        endTime = ::GetTickCount();
        printf("[arma  computation] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure native computation------------------------------------------
    // Create the native array
    {
        double *nativeMatrix = new double [X_SIZE * Y_SIZE];
        // Untimed initialization over the flat buffer.
        for (int y = 0; y < Y_SIZE*X_SIZE; ++y)
        {
            nativeMatrix[y] = 1.0001;
        }
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Timed kernel: flat pass applying v += v * 0.5 to every element.
            for (int y = 0; y < Y_SIZE*X_SIZE; ++y)
            {
                nativeMatrix[y] += nativeMatrix[y] * 0.5;
            }
        }
        endTime = ::GetTickCount();
        printf("[Native computation]Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
        delete[] nativeMatrix;
    }

    return 0;
}

在Windows上，使用VS2010，结果如下：

[Boost Loop] Elapsed time:  1.217 seconds
[Blitz Loop] Elapsed time:  0.046 seconds
[arma  Loop]  Elapsed time:  0.078 seconds
[Native Loop]Elapsed time:  0.172 seconds
[Boost computation] Elapsed time:  2.152 seconds
[Blitz computation] Elapsed time:  0.156 seconds
[arma  computation] Elapsed time:  0.078 seconds
[Native computation]Elapsed time:  0.078 seconds

在Windows上使用Intel C++的结果如下：

[Boost Loop] Elapsed time:  0.468 seconds
[Blitz Loop] Elapsed time:  0.125 seconds
[arma  Loop]  Elapsed time:  0.046 seconds
[Native Loop]Elapsed time:  0.047 seconds
[Boost computation] Elapsed time:  0.796 seconds
[Blitz computation] Elapsed time:  0.109 seconds
[arma  computation] Elapsed time:  0.078 seconds
[Native computation]Elapsed time:  0.062 seconds

有些奇怪：

(1) With VS2010, the native computation (which includes the loop) is faster than the plain native loop.
(2) The Blitz loop behaves very differently under VS2010 and Intel C++.

要使用Intel C++编译器编译Blitz++，需要在blitz/intel/文件夹中提供一个名为bzconfig.h的文件。但是这个文件不存在。我只是把blitz / ms / bzconfig.h中的文件复制进去了，但这可能会导致非最优配置。请问有谁知道如何使用Intel C++编译器编译Blitz ++吗？手册中说要运行bzconfig脚本以获取正确的bzconfig.h文件，但我不明白它的意思。

非常感谢！

我的结论：

1. Boost multi array is the slowest.
2. With intel c++ compiler, native pointers are very fast.
3. With intel c++ compiler,  armadillo can achieve the performance of native pointers.
4. Also test eigen, it is x0% slower than armadillo in my simple cases.
5. Curious about blitz++'s behavior in intel c++ compiler with proper configuration.
   Please see my question.

- user1899020

6

请注意，默认情况下Armadillo启用了边界检查（以及其他有用的合理性检查）。这样做的原因是先确保算法正确，然后再进行优化。你可以通过禁用边界检查来加速Armadillo的运行速度，具体请参见文档此处和此处。基本上，这相当于在包含armadillo头文件之前定义ARMA_NO_DEBUG，类似于BOOST_DISABLE_ASSERTS的方式。 - mtall

1

你尝试过改变顺序，使其按照本地、arma、blitz、boost的顺序读取吗？ - Jens Munk

2

所以，你唯一的问题是“有人能告诉我如何使用英特尔C++编译器编译blitz++吗？”如果是这样，请提供详细信息，我们不需要其余的内容 - 它应该放在博客文章中，或者如果你正在征求反馈意见，则可以放在http://codereview.stackexchange.com上。 - Tony Delroy

你是否通过在config.hpp中定义ARMA_USE_LAPACK和ARMA_USE_BLAS来替换arma的默认BLAS和LAPACK库？使用OpenBLAS可以获得多线程性能。使用单个赋值给元素的测试不是一个好方法，因为这是一种非常低效的填充矩阵的方式，而且库也不会针对这种方式进行优化。 - flohack

1

许多这样的帖子，包括你所提到的那个，都没有进行恰当的比较。你引用的帖子的最新回复 https://dev59.com/L3RB5IYBdhLWcg3w-8E9#539001 表明，如果针对 Boost 进行优化的话，multi_array 可以与本地数组实现相媲美。 - alle_meije

显示剩余2条评论

4个回答

5

据我所知，您正在通过测量将单个矩阵乘以标量的速度来评估每个矩阵库的性能。由于其基于模板的策略，Armadillo在这方面做得非常好，通过为大多数编译器分解每个乘法为可并行化的代码。

但是我建议您需要重新考虑测试范围和方法。例如，您遗漏了每个BLAS实现。您需要的BLAS函数将是dscal。特定CPU的供应商提供的实现可能会很好地完成工作。

更相关的是，任何合理的向量库都需要能够执行许多其他操作：矩阵乘法、点积、向量长度、转置等，这些都没有在您的测试中得到解决。您的测试只涉及两件事：元素赋值，在向量库中实际上从来不是瓶颈，以及标量/向量乘法，这是由每个CPU制造商提供的BLAS 1级函数。

这里讨论了BLAS一级和编译器生成的代码之间的区别。

简而言之，在您的平台上使用已链接的BLAS和LAPACK本地库的Armadillo。

- johnwbyrd

2

使用“_成年人讨论……_”这样的措辞显得非常轻蔑和不专业。 - hbrerkere

2

我的测试表明，Boost数组与本地/硬编码的C++代码具有相同的性能。

您需要使用激活的编译器优化进行比较。也就是说： -O3 -DNDEBUG -DBOOST_UBLAS_NDEBUG -DBOOST_DISABLE_ASSERTS -DARMA_NO_DEBUG ... 当我进行测试（em ++）时，如果您停用其断言，启用级别3优化，使用-O3等，则Boost的性能至少快10倍。任何公平的比较都应使用这些标志。

- Sohail Si

0

出于好奇，我重新进行了这个速度测试。

我为Eigen库和我的自己的库MULTI（https://gitlab.com/correaa/boost-multi）添加了测试。

完整的代码适配到Linux后，在帖子末尾如下所示。

实现了一个多维数组库后，重要的是要考虑到大多数这些库都带有可选择禁用的边界检查。

此外，需要注意的是，计时测量至少需要防止编译器优化掉没有副作用的代码。因此，我添加了一个doNotOptimizeAway函数来防止这种情况发生。（请参见其代码在末尾。）还需要知道从哪里调用这个函数（我认为是在重复循环内部）。

以下是结果：

没有优化（这不是一个好的测试；它只是展示了在调试模式下运行这些测量有多糟糕）

$ g++ a.cpp && ./a.out
[Boost Loop] Elapsed time:  7.216 seconds
[Blitz Loop] Elapsed time:  1.151 seconds
[arma  Loop]  Elapsed time:  0.747 seconds
[Native Loop] Elapsed time:  0.319 seconds
[EIGEN Loop] Elapsed time:  9.022 seconds
[MULTI Loop] Elapsed time:  7.769 seconds

[Boost computation] Elapsed time: 15.456 seconds
[Blitz computation] Elapsed time:  4.441 seconds
[arma  computation] Elapsed time:  0.662 seconds
[Native computation] Elapsed time:  0.340 seconds
[EIGEN computation] Elapsed time: 18.714 seconds
[MULTI computation] Elapsed time: 15.434 seconds

优化后：

$ sudo cpupower frequency-set --governor performance
$ g++ -O3 -DNDEBUG a.cpp && sudo nice -n -10 ./a.out
[Boost Loop] Elapsed time:  0.024 seconds
[Blitz Loop] Elapsed time:  0.027 seconds
[arma  Loop]  Elapsed time:  0.052 seconds
[Native Loop] Elapsed time:  0.023 seconds
[EIGEN Loop] Elapsed time:  0.138 seconds
[MULTI Loop] Elapsed time:  0.031 seconds

[Boost computation] Elapsed time:  0.059 seconds
[Blitz computation] Elapsed time:  0.054 seconds
[arma  computation] Elapsed time:  0.061 seconds
[Native computation] Elapsed time:  0.058 seconds
[EIGEN computation] Elapsed time:  0.196 seconds
[MULTI computation] Elapsed time:  0.060 seconds

使用快速数学：

$ g++ -O3 -ffast-math -DNDEBUG a.cpp && sudo nice -n -10 ./a.out
[Boost Loop] Elapsed time:  0.024 seconds
[Blitz Loop] Elapsed time:  0.028 seconds
[arma  Loop]  Elapsed time:  0.051 seconds
[Native Loop] Elapsed time:  0.024 seconds
[EIGEN Loop] Elapsed time:  0.139 seconds
[MULTI Loop] Elapsed time:  0.027 seconds

[Boost computation] Elapsed time:  0.040 seconds
[Blitz computation] Elapsed time:  0.033 seconds
[arma  computation] Elapsed time:  0.043 seconds
[Native computation] Elapsed time:  0.039 seconds
[EIGEN computation] Elapsed time:  0.148 seconds
[MULTI computation] Elapsed time:  0.041 seconds

使用本页提到的用于禁用边界检查的宏：

$ sudo cpupower frequency-set --governor performance
$ g++ -march=native -mtune=native -Ofast -DNDEBUG -DARMA_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DARMA_USE_LAPACK -DARMA_USE_BLAS -DBOOST_UBLAS_NDEBUG -DARMA_NO_DEBUG -DEIGEN_NO_DEBUG a.cpp && sudo nice -n -10 ./a.out
[Boost Loop] Elapsed time:  0.024 seconds
[Blitz Loop] Elapsed time:  0.024 seconds
[arma  Loop]  Elapsed time:  0.022 seconds
[Native Loop] Elapsed time:  0.022 seconds
[EIGEN Loop] Elapsed time:  0.131 seconds
[MULTI Loop] Elapsed time:  0.024 seconds

[Boost computation] Elapsed time:  0.025 seconds
[Blitz computation] Elapsed time:  0.023 seconds
[arma  computation] Elapsed time:  0.045 seconds
[Native computation] Elapsed time:  0.024 seconds
[EIGEN computation] Elapsed time:  0.132 seconds
[MULTI computation] Elapsed time:  0.024 seconds

使用clang，结果有些令人困惑（我的库和Blitz表现得更糟！）

$ clang++ -std=c++17 -march=native -mtune=native -Ofast -DNDEBUG -DARMA_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DARMA_USE_LAPACK -DARMA_USE_BLAS -DBOOST_UBLAS_NDEBUG -DARMA_NO_DEBUG -DEIGEN_NO_DEBUG a.cpp && sudo nice -n -10 ./a.out
[Boost Loop] Elapsed time:  0.025 seconds
[Blitz Loop] Elapsed time:  0.024 seconds
[arma  Loop]  Elapsed time:  0.023 seconds
[Native Loop] Elapsed time:  0.023 seconds
[EIGEN Loop] Elapsed time:  0.132 seconds
[MULTI Loop] Elapsed time:  0.069 seconds

[Boost computation] Elapsed time:  0.024 seconds
[Blitz computation] Elapsed time:  0.055 seconds
[arma  computation] Elapsed time:  0.023 seconds
[Native computation] Elapsed time:  0.023 seconds
[EIGEN computation] Elapsed time:  0.124 seconds
[MULTI computation] Elapsed time:  0.087 seconds

所以，就是这样。看起来，如今最重要的因素是知道如何进行优化编译、禁用调试、选择编译器，并且知道如何编写定时测试，而不仅仅是选择库（至少对于这些简单的使用模式）。

Eigen在这方面仍然是一个例外；如果有人知道使用Eigen进行正确编译的选项，请告诉我。

我Ubuntu 23.04机器的更多细节如下：

g++ (Ubuntu 12.3.0-1ubuntu1~23.04) 12.3.0
clang version 15.0.7
libboost1.81-all-dev/lunar 1.81.0-4build2 amd64
libblitz0-dev/lunar,now 1:1.0.2+ds-4 amd64 [installed]
libarmadillo-dev/lunar,now 1:11.4.2+dfsg-1 amd64 [installed]
libeigen3-dev/lunar,now 3.4.0-4 all [installed]
MULTI v0.80.1 from https://gitlab.com/correaa/boost-multi

CPU: Intel® Core™ i7-9750H × 12

完整代码：

// Configuration macros must be defined before the headers they affect.
#define _SCL_SECURE_NO_WARNINGS
#define BOOST_DISABLE_ASSERTS

#include <chrono>
#include <cstdio>
#include <iostream>

#include <boost/multi_array.hpp>
#include <blitz/array.h>
#include <armadillo>
#include <eigen3/Eigen/Dense>
#include "/home/correaa/boost-multi/include/multi/array.hpp"

using namespace std;
/// Stand-in for the Win32 GetTickCount() call used by the original benchmark:
/// returns the current reading of the monotonic std::chrono::steady_clock.
std::chrono::steady_clock::time_point GetTickCount()
{
    return std::chrono::steady_clock::now();
}

/// Optimization barrier for micro-benchmarks.
///
/// Feeds `t` as an input operand to an empty, volatile inline-assembly
/// statement so the compiler has to materialize the value. The "memory"
/// clobber additionally tells the compiler that the statement may read or
/// write arbitrary memory, so stores made through `t` (for example into the
/// heap buffer behind a raw pointer) can no longer be discarded as dead.
/// Without the clobber only the operand itself is considered used.
///
/// Uses GNU extended-asm syntax (GCC / Clang).
///
/// @param t  value or object whose computation must be kept; it is not modified.
template <class T>
void doNotOptimizeAway(T&& t) {
    __asm__ __volatile__ ("" :: "g" (t) : "memory");
}

int main(int argc, char* argv[])
{
    const int X_SIZE = 1000;
    const int Y_SIZE = 1000;
    double factor = 10;  // had to do more iteration to get more steady results, the timings are normalized still
    const int ITERATIONS = 100*10;
    auto startTime = ::GetTickCount();
    auto endTime = ::GetTickCount();

    // Create the boost array


    //------------------Measure boost Loop------------------------------------------
    {
        typedef boost::multi_array<double, 2> ImageArrayType;
        ImageArrayType boostMatrix(boost::extents[X_SIZE][Y_SIZE]);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    boostMatrix[x][y] = 1.0001;
                }
            }
            doNotOptimizeAway(boostMatrix);
        }
        endTime = ::GetTickCount();
        printf("[Boost Loop] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }
    //------------------Measure blitz Loop-------------------------------------------
    {
        blitz::Array<double, 2> blitzArray( X_SIZE, Y_SIZE );
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    blitzArray(x,y) = 1.0001;
                }
            }
            doNotOptimizeAway(blitzArray);
        }
        endTime = ::GetTickCount();
        printf("[Blitz Loop] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }

    //------------------Measure armadillo loop----------------------------------------
    {
        arma::mat matArray( X_SIZE, Y_SIZE );
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int y = 0; y < Y_SIZE; ++y)
            {
                for (int x = 0; x < X_SIZE; ++x)
                {
                    matArray(x,y) = 1.0001;
                }
            }
            doNotOptimizeAway(matArray);
        }
        endTime = ::GetTickCount();
        printf("[arma  Loop]  Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }

    //------------------Measure native loop----------------------------------------
    // Create the native array
    {
        double *nativeMatrix = new double [X_SIZE * Y_SIZE];
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int y = 0; y < Y_SIZE*X_SIZE; ++y)
            {
                nativeMatrix[y] = 1.0001;
            }
            doNotOptimizeAway(nativeMatrix);
        }
        endTime = ::GetTickCount();
        printf("[Native Loop] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
        delete[] nativeMatrix;
    }
    //------------------Measure EIGEN Loop------------------------------------------
    {
        typedef Eigen::MatrixXd ImageArrayType;
        ImageArrayType eigenMatrix(X_SIZE, Y_SIZE);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    eigenMatrix(x, y) = 1.0001;
                }
            }
            doNotOptimizeAway(eigenMatrix);
        }
        endTime = ::GetTickCount();
        printf("[EIGEN Loop] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }
    //------------------Measure Multi Loop------------------------------------------
    {
        typedef boost::multi::array<double, 2> ImageArrayType;
        ImageArrayType multiMatrix({X_SIZE, Y_SIZE});
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    multiMatrix[x][y] = 1.0001;
                }
            }
            doNotOptimizeAway(multiMatrix);
        }
        endTime = ::GetTickCount();
        printf("[MULTI Loop] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }


    //------------------Measure boost computation-----------------------------------
    {
        typedef boost::multi_array<double, 2> ImageArrayType;
        ImageArrayType boostMatrix(boost::extents[X_SIZE][Y_SIZE]);
        for (int x = 0; x < X_SIZE; ++x)
        {
            for (int y = 0; y < Y_SIZE; ++y)
            {
                boostMatrix[x][y] = 1.0001;
            }
        }
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    boostMatrix[x][y] += boostMatrix[x][y] * 0.5;
                }
            }
            doNotOptimizeAway(boostMatrix);
        }
        endTime = ::GetTickCount();
        printf("[Boost computation] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }

    //------------------Measure blitz computation-----------------------------------
    {
        blitz::Array<double, 2> blitzArray( X_SIZE, Y_SIZE );
        blitzArray = 1.0001;
        doNotOptimizeAway(blitzArray);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            blitzArray += blitzArray*0.5;
            doNotOptimizeAway(blitzArray);
        }
        endTime = ::GetTickCount();
        printf("[Blitz computation] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }

    //------------------Measure armadillo computation-------------------------------
    {
        arma::mat matArray( X_SIZE, Y_SIZE );
        matArray.fill(1.0001);
        doNotOptimizeAway(matArray);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            //matArray.fill(1.0001);
            matArray += matArray*0.5;
            doNotOptimizeAway(matArray);
        }
        endTime = ::GetTickCount();
        printf("[arma  computation] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }

    //------------------Measure native computation------------------------------------------
    // Create the native array
    {
        double *nativeMatrix = new double [X_SIZE * Y_SIZE];
        for (int y = 0; y < Y_SIZE*X_SIZE; ++y)
        {
            nativeMatrix[y] = 1.0001;
        }
        doNotOptimizeAway(nativeMatrix);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int y = 0; y < Y_SIZE*X_SIZE; ++y)
            {
                nativeMatrix[y] += nativeMatrix[y] * 0.5;
            }
            doNotOptimizeAway(nativeMatrix);
        }
        endTime = ::GetTickCount();
        printf("[Native computation] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
        delete[] nativeMatrix;
    }
    //------------------Measure EIGEN computation-----------------------------------
    {
        typedef Eigen::MatrixXd ImageArrayType;
        ImageArrayType eigenMatrix(X_SIZE, Y_SIZE);
        for (int x = 0; x < X_SIZE; ++x)
        {
            for (int y = 0; y < Y_SIZE; ++y)
            {
                eigenMatrix(x, y) = 1.0001;
            }
        }
        doNotOptimizeAway(eigenMatrix);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    eigenMatrix(x, y) += eigenMatrix(x, y) * 0.5;
                }
            }
            doNotOptimizeAway(eigenMatrix);
        }
        endTime = ::GetTickCount();
        printf("[EIGEN computation] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }
    //------------------Measure MULTI computation-----------------------------------
    {
        typedef boost::multi::array<double, 2> ImageArrayType;
        ImageArrayType multiMatrix({X_SIZE, Y_SIZE});
        for (int x = 0; x < X_SIZE; ++x)
        {
            for (int y = 0; y < Y_SIZE; ++y)
            {
                multiMatrix[x][y] = 1.0001;
            }
        }
        doNotOptimizeAway(multiMatrix);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    multiMatrix[x][y] += multiMatrix[x][y] * 0.5;
                }
            }
            doNotOptimizeAway(multiMatrix);
        }
        endTime = ::GetTickCount();
        printf("[MULTI computation] Elapsed time: %6.3f seconds\n", chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count() / 1000000000.0 / factor);
    }

    return 0;
}

- alfC

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- Rainer Blome · Accepted Answer

简短回答: 通过阅读 Blitz++ 用户指南，找到 ./configure CXX=icpc 命令。

详细回答:

要使用英特尔 C++ 编译器编译 blitz++，需要在 blitz/intel/ 文件夹中有一个名为 bzconfig.h 的文件。但是没有这个文件。

没错，Blitz++ 应该会自动生成这个文件。根据 Blitz++ 用户指南（包含在 blitz-0.10.tar.gz 中的 blitz.pdf），"Installation" 章节所述，

Blitz++ 使用 GNU Autoconf 来重写各种平台和编译器的 Makefile。

更准确地说，Blitz++ 使用 GNU autotools 工具链（automake、autoconf、configure）生成 makefile、configure 脚本、头文件等。bzconfig.h 文件应该由随附的 configure 脚本生成，可以直接使用。

我只是复制了 blitz/ms/bzconfig.h 中的文件。这可能会导致非最优配置。

如果“非最优”对你来说意味着“不工作”，那么是的。:-) 您需要一个准确表示您的编译器的intel/bzconfig.h。

有人可以告诉我如何使用intel c++编译器编译blitz++吗？

请阅读并遵循精细手册，特别是上面提到的“安装”部分。

进入“blitz-VERSION”目录，然后键入： ./configure CXX=[compiler] 其中[compiler]是以下之一：xlc ++、icpc、pathCC、xlC、cxx、aCC、CC、g ++、KCC、pgCC或FCC。（如果您没有选择C ++编译器，则配置脚本将尝试为当前平台找到合适的编译器。）

您已经这样做了吗？对于Intel编译器，您需要使用./configure CXX=icpc。

在手册中，它说要运行bzconfig脚本以获取正确的bzconfig.h。但我不明白它的意思。

我猜你所说的“它”是指“那”。你所说的“手册”是什么意思？我的Blitz++用户指南副本中没有提到bzconfig。你确定你正在使用与你的Blitz++版本相对应的手册吗？

附注：在 blitz-0.10 的内容中查找“bzconfig”，看起来“bzconfig”不再是 Blitz++ 的一部分，但曾经是：`find . -name bzconfig` -> 没有结果；`find . -print0 | xargs -0 grep -a -i -n -e bzconfig` 的输出如下：

./blitz/compiler.h:44:    #error  In <blitz/config.h>: A working template implementation is required by Blitz++ (you may need to rerun the compiler/bzconfig script)

那需要更新。

./blitz/gnu/bzconfig.h:4:/* blitz/gnu/bzconfig.h. Generated automatically at end of configure. */
./configure.ac:159:# autoconf replacement of bzconfig

就是这样，这些bzconfig.h文件应该由configure生成。

./ChangeLog.1:1787: will now replace the old file that was generate with the bzconfig

可能是转换到autoconf的更改。

./INSTALL:107:  2. Go into the compiler subdirectory and run the bzconfig

那需要更新。这是让你寻找bzconfig的原因吗？

./README:27:compiler      Compiler tests (used with obsolete bzconfig script)

需要更新，编译器目录已不再包含。