我一直在尝试对一个特定的应用程序进行向量化,并且已经尝试了各种方法——从自动向量化到手写SSE内部函数(intrinsics)——但不知何故,这个基于模板(stencil)计算的应用程序始终无法获得加速。
以下是我的当前代码片段,我使用了SSE指令集进行向量化。当我使用-vec-report3编译它(Intel icc)时,我不断收到以下消息:
remark: loop was not vectorized: statement cannot be vectorized.
// Hand-vectorized stencil loop: each iteration loads groups of four floats from
// p2 with SSE intrinsics and stores four results into tmp2.
// NOTE(review): the body already operates on __m128 values, so the pragma below
// is presumably aimed at icc's auto-vectorizer rather than at the intrinsics
// themselves — confirm whether it has any effect here.
#pragma ivdep
for ( i = STENCIL; i < z - STENCIL; i+=4 )
{
it = it2 + i; // index of the centre element; it2 is set outside this snippet
// Each of the next four statements adds the two loads at symmetric offsets
// (-/+it_j4, -/+it_j3, -/+it_j2, -/+it_j) and scales the sum by X4_i ... X1_i.
// NOTE(review): _mm_load_ps requires 16-byte-aligned addresses; the alignment
// of p2 and of these offsets is not visible here — verify.
__m128 tmp2i = _mm_mul_ps(_mm_add_ps(_mm_load_ps(&p2[i+j*it_j-it_j4+k*it_k]),_mm_load_ps(&p2[i+j*it_j+it_j4+k*it_k])),X4_i); //loop was not vectorized: statement cannot be vectorized
__m128 tmp3 = _mm_mul_ps(_mm_add_ps(_mm_load_ps(&p2[i+j*it_j-it_j3+k*it_k]),_mm_load_ps(&p2[i+j*it_j+it_j3+k*it_k])),X3_i);
__m128 tmp4 = _mm_mul_ps(_mm_add_ps(_mm_load_ps(&p2[i+j*it_j-it_j2+k*it_k]),_mm_load_ps(&p2[i+j*it_j+it_j2+k*it_k])),X2_i);
__m128 tmp5 = _mm_mul_ps(_mm_add_ps(_mm_load_ps(&p2[i+j*it_j-it_j +k*it_k]),_mm_load_ps(&p2[i+j*it_j+it_j +k*it_k])),X1_i);
// Sum the four weighted pairs and add the element at p2[it] scaled by C00_i.
__m128 tmp6 = _mm_add_ps(_mm_add_ps(_mm_add_ps(tmp2i,tmp3),_mm_add_ps(tmp4,tmp5)), _mm_mul_ps(_mm_load_ps(&p2[it]),C00_i));
_mm_store_ps(&tmp2[i],tmp6); // write the four results starting at tmp2[i]
}
我是否遗漏了什么关键的东西?由于该消息没有详细说明为什么它不能被向量化,我很难确定瓶颈所在。
更新:在认真考虑了大家的建议之后,我按以下方式调整了代码。我认为最好的做法是将其进一步拆分,以便找出真正导致向量依赖(vector dependence)的语句。
// Same stencil update as the earlier loop, but with every load issued as its
// own statement so that a compiler remark can be tied to a single line.
// The u_j* variables are declared outside this snippet.
//#pragma ivdep
for ( i = STENCIL; i < z - STENCIL; i+=4 )
{
it = it2 + i; // index of the centre element; it2 is set outside this snippet
// Centre term: four floats at p2[it] scaled by C00_i.
__m128 center = _mm_mul_ps(_mm_load_ps(&p2[it]),C00_i);
// Loads at the negative offsets -it_j4, -it_j3, -it_j2, -it_j.
// NOTE(review): _mm_load_ps requires 16-byte-aligned addresses; the alignment
// of p2 and of these offsets is not visible here — verify.
u_j4 = _mm_load_ps(&p2[i+j*it_j-it_j4+k*it_k]); //Line 180
u_j3 = _mm_load_ps(&p2[i+j*it_j-it_j3+k*it_k]);
u_j2 = _mm_load_ps(&p2[i+j*it_j-it_j2+k*it_k]);
u_j1 = _mm_load_ps(&p2[i+j*it_j-it_j +k*it_k]);
// Loads at the matching positive offsets +it_j4, +it_j3, +it_j2, +it_j.
u_j8 = _mm_load_ps(&p2[i+j*it_j+it_j4+k*it_k]);
u_j7 = _mm_load_ps(&p2[i+j*it_j+it_j3+k*it_k]);
u_j6 = _mm_load_ps(&p2[i+j*it_j+it_j2+k*it_k]);
u_j5 = _mm_load_ps(&p2[i+j*it_j+it_j +k*it_k]);
// Pair each negative-offset load with its positive counterpart and scale the
// sum by the corresponding coefficient X4_i ... X1_i.
__m128 tmp2i = _mm_mul_ps(_mm_add_ps(u_j4,u_j8),X4_i);
__m128 tmp3 = _mm_mul_ps(_mm_add_ps(u_j3,u_j7),X3_i);
__m128 tmp4 = _mm_mul_ps(_mm_add_ps(u_j2,u_j6),X2_i);
__m128 tmp5 = _mm_mul_ps(_mm_add_ps(u_j1,u_j5),X1_i);
// Combine the four weighted pairs, then add the centre term.
__m128 tmp6 = _mm_add_ps(_mm_add_ps(tmp2i,tmp3),_mm_add_ps(tmp4,tmp5));
__m128 tmp7 = _mm_add_ps(tmp6,center);
// NOTE(review): whether tmp2 can alias p2 is not visible in this snippet;
// the dependence remark quoted in the post refers to this store — confirm.
_mm_store_ps(&tmp2[i],tmp7); //Line 196
}
如果不使用 #pragma ivdep 编译(icc)上面的代码,我会收到以下信息:
remark: loop was not vectorized: existence of vector dependence.
vector dependence: assumed FLOW dependence between tmp2 line 196 and tmp2 line 196.
vector dependence: assumed ANTI dependence between tmp2 line 196 and tmp2 line 196.
而当我使用 #pragma ivdep 编译(icc)时,我会收到如下消息:
remark: loop was not vectorized: unsupported data type. //Line 180
为什么编译器认为第196行存在依赖关系?如何消除这种假定的向量依赖关系?