问题在于
mean += (data[i] - mean) / (i+1);
并不容易适用于SIMD。然而,通过仔细研究数学,可以在不太费力的情况下将其向量化。
关键公式是
mean(n+m) = (n*mean(n) + m*mean(m))/(n+m)
这段代码展示了如何添加 n
个数字的平均值和 m
个数字的平均值。这可以在你的运算符定义 RunningMean operator+(RunningMean& rhs)
中看到。这也解释了为什么你的并行代码能够正常工作。如果我们将你的 C++ 代码进行分解,我认为这会更加清晰明了:
double mean = 0.0;
int count = 0;
#pragma omp parallel
{
double mean_private = 0.0;
int count_private = 0;
#pragma omp for nowait
for(size_t i=0; i<N; i++) {
count_private ++;
mean_private += (data[i] - mean_private)/count_private;
}
#pragma omp critical
{
mean = (count_private*mean_private + count*mean);
count += count_private;
mean /= count;
}
}
但是我们可以使用相同的思路来处理SIMD(并将它们组合在一起)。但首先让我们只处理SIMD部分。使用AVX,我们可以同时处理四个并行均值。每个并行均值将处理以下数据元素:
mean 1 data elements: 0, 4, 8, 12,...
mean 2 data elements: 1, 5, 9, 13,...
mean 3 data elements: 2, 6, 10, 14,...
mean 4 data elements: 3, 7, 11, 15,...
当我们循环遍历所有元素后,我们将四个并行求和相加,然后除以四(因为每个求和都运行在N/4个元素上)。
以下是实现此操作的代码:
double mean = 0.0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
for(size_t i=0; i<N/4; i++) {
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean = _mm_cvtsd_f64(t3)/4;
int count = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count++;
mean2 += (data[i] - mean2)/count;
}
mean = (4*(N/4)*mean + count*mean2)/N;
最后,我们可以将这个与OpenMP结合起来,以此获得SIMD和MIMD的全部优势,代码如下:
double mean = 0.0;
int count = 0;
#pragma omp parallel
{
double mean_private = 0.0;
int count_private = 0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
#pragma omp for nowait
for(size_t i=0; i<N/4; i++) {
count_private++;
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean_private = _mm_cvtsd_f64(t3)/4;
#pragma omp critical
{
mean = (count_private*mean_private + count*mean);
count += count_private;
mean /= count;
}
}
int count2 = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count2++;
mean2 += (data[i] - mean2)/count2;
}
mean = (4*(N/4)*mean + count2*mean2)/N;
这里是一个可工作的示例(使用-O3 -mavx -fopenmp
编译)
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
double mean_simd(double *data, const int N) {
double mean = 0.0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
for(size_t i=0; i<N/4; i++) {
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean = _mm_cvtsd_f64(t3)/4;
int count = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count++;
mean2 += (data[i] - mean2)/count;
}
mean = (4*(N/4)*mean + count*mean2)/N;
return mean;
}
double mean_simd_omp(double *data, const int N) {
double mean = 0.0;
int count = 0;
#pragma omp parallel
{
double mean_private = 0.0;
int count_private = 0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
#pragma omp for nowait
for(size_t i=0; i<N/4; i++) {
count_private++;
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean_private = _mm_cvtsd_f64(t3)/4;
#pragma omp critical
{
mean = (count_private*mean_private + count*mean);
count += count_private;
mean /= count;
}
}
int count2 = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count2++;
mean2 += (data[i] - mean2)/count2;
}
mean = (4*(N/4)*mean + count2*mean2)/N;
return mean;
}
int main() {
const int N = 1001;
double data[N];
for(int i=0; i<N; i++) data[i] = 1.0*rand()/RAND_MAX;
float sum = 0; for(int i=0; i<N; i++) sum+= data[i]; sum/=N;
printf("mean %f\n", sum);
printf("mean_simd %f\n", mean_simd(data, N);
printf("mean_simd_omp %f\n", mean_simd_omp(data, N));
}
mean += data[i] / N
怎么样? - GHLmean
副本,其中mean.mean=0
和mean.count=0
。该线程循环遍历 N 数组元素的子集,执行mean += data[i]
,生成具有一些值的mean.mean
和处理的元素数量的mean.count
的mean
。该私有的mean
副本进入约简,其中使用两个RunningMean
对象的 += 运算符。即使在副本进入约简之前只有一个data
条目进入了mean
的私有副本,数学仍然可以正确地工作。 - chippies