C++ 数组访问优化

5

我正在修改一个开源项目(powder toy),并且始终使用/O2(最大化速度)选项进行编译,启用了SSE2代码生成,并刚刚检查了这个:

void membwand(void * destv, void * srcv, size_t destsize, size_t srcsize)
{
    size_t i;
    unsigned char * dest = (unsigned char*)destv;
    unsigned char * src = (unsigned char*)srcv;
    for(i = 0; i < destsize; i++){
        dest[i] = dest[i] & src[i%srcsize];
    }
}

这里我替换了以下行

membwand(gravy, gravmask, size_dst, size_src);

membwand(gravx, gravmask, size_dst, size_src);

    // gravy, gravx and gravmask are 1MB each

使用

membwand2(gravy,gravx, gravmask, size_dst, size_src);   
    // sizes are same, why not put all in a single function?

这是实现为:

void membwand2(void * destv1, void * destv2,void * srcv, size_t destsize, size_t srcsize)
{
    if(destsize!=srcsize)
    {
        size_t i;
        unsigned char * dest1 = (unsigned char*)destv1;
        unsigned char * dest2 = (unsigned char*)destv2;
        unsigned char * src = (unsigned char*)srcv;
        for(i = 0; i < destsize; i++)
        {
            dest1[i] = dest1[i] & src[i%srcsize];
            dest2[i] = dest2[i] & src[i%srcsize];
        }
    }
    else
    {
        size_t i;
        unsigned char * dest1 = (unsigned char*)destv1;unsigned char * dest2 = (unsigned char*)destv2;
        unsigned char * src = (unsigned char*)srcv;
        unsigned char tmpChar0=0;unsigned char tmpChar1=0;unsigned char tmpChar2=0;
        unsigned char tmpChar3=0;unsigned char tmpChar4=0;unsigned char tmpChar5=0;
        unsigned char tmpChar6=0;unsigned char tmpChar7=0;unsigned char tmpChar8=0;
        unsigned char tmpChar9=0;unsigned char tmpChar10=0;unsigned char tmpChar11=0;
        unsigned char tmpChar12=0;unsigned char tmpChar13=0;unsigned char tmpChar14=0;
        unsigned char tmpChar15=0;unsigned char tmpChar16=0;unsigned char tmpChar17=0;
        unsigned char tmpChar18=0;unsigned char tmpChar19=0;unsigned char tmpChar20=0;
        unsigned char tmpChar21=0;unsigned char tmpChar22=0;unsigned char tmpChar23=0;
        unsigned char tmpChar24=0;unsigned char tmpChar25=0;unsigned char tmpChar26=0;
        unsigned char tmpChar27=0;unsigned char tmpChar28=0;unsigned char tmpChar29=0;
        unsigned char tmpChar30=0;unsigned char tmpChar31=0;
        for(i = 0; i < destsize; i+=32)
        {
            tmpChar0=src[i];tmpChar1=src[i+1];tmpChar2=src[i+2];tmpChar3=src[i+3];
            tmpChar4=src[i+4];tmpChar5=src[i+5];tmpChar6=src[i+6];tmpChar7=src[i+7];

            tmpChar8=src[i+8];tmpChar9=src[i+9];tmpChar10=src[i+10];tmpChar11=src[i+11];
            tmpChar12=src[i+12];tmpChar13=src[i+13];tmpChar14=src[i+14];tmpChar15=src[i+15];

            tmpChar16=src[i+16];tmpChar17=src[i+17];tmpChar18=src[i+18];tmpChar19=src[i+19];
            tmpChar20=src[i+20];tmpChar21=src[i+21];tmpChar22=src[i+22];tmpChar23=src[i+23];

            tmpChar24=src[i+24];tmpChar25=src[i+25];tmpChar26=src[i+26];tmpChar27=src[i+27];
            tmpChar28=src[i+28];tmpChar29=src[i+29];tmpChar30=src[i+30];tmpChar31=src[i+31];


            dest1[i] = dest1[i] & tmpChar0;
            dest1[i+1] = dest1[i+1] & tmpChar1;
            dest1[i+2] = dest1[i+2] & tmpChar2;
            dest1[i+3] = dest1[i+3] & tmpChar3;
            dest1[i+4] = dest1[i+4] & tmpChar4;
            dest1[i+5] = dest1[i+5] & tmpChar5;
            dest1[i+6] = dest1[i+6] & tmpChar6;
            dest1[i+7] = dest1[i+7] & tmpChar7;
            dest1[i+8] = dest1[i+8] & tmpChar8;
            dest1[i+9] = dest1[i+9] & tmpChar9;
            dest1[i+10] = dest1[i+10] & tmpChar10;
            dest1[i+11] = dest1[i+11] & tmpChar11;
            dest1[i+12] = dest1[i+12] & tmpChar12;
            dest1[i+13] = dest1[i+13] & tmpChar13;
            dest1[i+14] = dest1[i+14] & tmpChar14;
            dest1[i+15] = dest1[i+15] & tmpChar15;
            dest1[i+16] = dest1[i+16] & tmpChar16;
            dest1[i+17] = dest1[i+17] & tmpChar17;
            dest1[i+18] = dest1[i+18] & tmpChar18;
            dest1[i+19] = dest1[i+19] & tmpChar19;
            dest1[i+20] = dest1[i+20] & tmpChar20;
            dest1[i+21] = dest1[i+21] & tmpChar21;
            dest1[i+22] = dest1[i+22] & tmpChar22;
            dest1[i+23] = dest1[i+23] & tmpChar23;
            dest1[i+24] = dest1[i+24] & tmpChar24;
            dest1[i+25] = dest1[i+25] & tmpChar25;
            dest1[i+26] = dest1[i+26] & tmpChar26;
            dest1[i+27] = dest1[i+27] & tmpChar27;
            dest1[i+28] = dest1[i+28] & tmpChar28;
            dest1[i+29] = dest1[i+29] & tmpChar29;
            dest1[i+30] = dest1[i+30] & tmpChar30;
            dest1[i+31] = dest1[i+31] & tmpChar31;

            dest2[i] = dest2[i] & tmpChar0;
            dest2[i+1] = dest2[i+1] & tmpChar1;
            dest2[i+2] = dest2[i+2] & tmpChar2;
            dest2[i+3] = dest2[i+3] & tmpChar3;
            dest2[i+4] = dest2[i+4] & tmpChar4;
            dest2[i+5] = dest2[i+5] & tmpChar5;
            dest2[i+6] = dest2[i+6] & tmpChar6;
            dest2[i+7] = dest2[i+7] & tmpChar7;
            dest2[i+8] = dest2[i+8] & tmpChar8;
            dest2[i+9] = dest2[i+9] & tmpChar9;
            dest2[i+10] = dest2[i+10] & tmpChar10;
            dest2[i+11] = dest2[i+11] & tmpChar11;
            dest2[i+12] = dest2[i+12] & tmpChar12;
            dest2[i+13] = dest2[i+13] & tmpChar13;
            dest2[i+14] = dest2[i+14] & tmpChar14;
            dest2[i+15] = dest2[i+15] & tmpChar15;
            dest2[i+16] = dest2[i+16] & tmpChar16;
            dest2[i+17] = dest2[i+17] & tmpChar17;
            dest2[i+18] = dest2[i+18] & tmpChar18;
            dest2[i+19] = dest2[i+19] & tmpChar19;
            dest2[i+20] = dest2[i+20] & tmpChar20;
            dest2[i+21] = dest2[i+21] & tmpChar21;
            dest2[i+22] = dest2[i+22] & tmpChar22;
            dest2[i+23] = dest2[i+23] & tmpChar23;
            dest2[i+24] = dest2[i+24] & tmpChar24;
            dest2[i+25] = dest2[i+25] & tmpChar25;
            dest2[i+26] = dest2[i+26] & tmpChar26;
            dest2[i+27] = dest2[i+27] & tmpChar27;
            dest2[i+28] = dest2[i+28] & tmpChar28;
            dest2[i+29] = dest2[i+29] & tmpChar29;
            dest2[i+30] = dest2[i+30] & tmpChar30;
            dest2[i+31] = dest2[i+31] & tmpChar31;


        }
    }
}

Omp墙钟时间对于两行版本为17ms,对于单行版本为1.5ms。(经过几千次迭代后)
问题:是什么导致了9倍的加速?src[]的数据重用或者缓存利用率?也许代码之前不适合向量化,展开使其可行,是否缺少索引的模运算?我应该在循环顶部添加#pragma omp吗?
编辑:#pragma omp parallel for num_threads(4) 使情况变得更糟。也许只有1MB无法隐藏多线程开销?
编辑2:省略模运算符使其为2.5毫秒,但性能从2.5毫秒到1.5毫秒(增加了约60%)必须来自展开/缓存/...
注意:启用“优先快速代码”,启用整个程序优化,完全优化(/Ox)和启用内置函数都没有改变速度(也许启用SSE2和/O2就足够了)
CPU:FX8150 IDE:Msvc

1
您可以使用/Qvec-report:1/Qvec-report:2来检查自动向量化。 - Z boson
1
@huseyintugrulbuyukisik 好的,但是如果不知道您正在移动的数据的结构定义,很难说。但我预计您的编译器已经尽最大努力对齐数据以实现最快的访问速度。但是,如果您正在深入微观优化,那么您也应该考虑这一点。 - Peter M
1
我希望你在进行比较时是在苹果和苹果之间进行比较,也就是说,需要比较调用 membwand(...) 和调用 membwand2(...) 的时间,因为你需要检查移动相同数据量所需的时间。 - Pandrei
1
[OT]由于“destsize”似乎是“32”的倍数,如果对齐允许,则可以直接使用“uint32_t”(或“uint64_t”)……。 - Jarod42
1
@huseyintugrulbuyukisik 您正在执行二进制 and 操作,而不关心数据的值,因此 endianness 应该是无关紧要的。 - Peter M
显示剩余10条评论
2个回答

3

我认为循环展开是提速的关键。你可以谷歌搜索以获取详细信息。

当然,模运算可能会很慢...你可以通过替换它来进行测试

src[i%srcsize] 

src[i] 

为了进行时间测试。


只是省略了模数运算,使得墙上计时变为2.5毫秒。因此,2.5毫秒-1.5毫秒的差异必须是由于展开和其他因素引起的。谢谢。 - huseyin tugrul buyukisik

2

在比较这两个函数时,有几个方面需要考虑;从高层次的角度来看,似乎两行版本应该比一行版本运行得更快,但是:

- the 1 line version has better instruction cache locatity
- the 1 line version puts less pressure on the registers( this is important 
 because if the compiler runs out of registers to use it may start using the 
 stack to move parameters around for the instructions it needs - which causes 
 an overhead)

作为一条优化提示:
- you can try and use the "restrict"   keyword: **void membwand(void * _restrict_
destv, void *  _restrict_ srcv, size_t destsize, size_t srcsize);** This tells the
compiler that the two pointers are not aliasing - they are will never point at the
same memory location so the read/write instruction can be executed in the same cycle.

尝试使用void *限制,但性能没有改变。也许我需要使用int而不是char? - huseyin tugrul buyukisik
不,数据类型与此无关;这只是向编译器传递额外信息的一种方式。 - Pandrei

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接