多线程的 random_r 比单线程版本慢

Question

多线程的 random_r 比单线程版本慢

10

以下程序与此处描述的程序基本相同。当我使用两个线程（NTHREADS == 2）运行并编译程序时，我得到以下运行时间：

real        0m14.120s
user        0m25.570s
sys         0m0.050s

当使用一个线程运行（NTHREADS == 1）时，即使只使用一个内核，我也会获得显着更好的运行时间。

real        0m4.705s
user        0m4.660s
sys         0m0.010s

我的系统是双核的，我知道random_r是线程安全的，并且我非常确定它是非阻塞的。当不使用random_r，而是使用余弦和正弦的计算作为替代时，双线程版本的运行时间比预期的时间要快约一半。

#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

#define NTHREADS 2
#define PRNG_BUFSZ 8
#define ITERATIONS 1000000000

void* thread_run(void* arg) {
    int r1, i, totalIterations = ITERATIONS / NTHREADS;
    for (i = 0; i < totalIterations; i++){
        random_r((struct random_data*)arg, &r1);
    }
    printf("%i\n", r1);
}

int main(int argc, char** argv) {
    struct random_data* rand_states = (struct random_data*)calloc(NTHREADS, sizeof(struct random_data));
    char* rand_statebufs = (char*)calloc(NTHREADS, PRNG_BUFSZ);
    pthread_t* thread_ids;
    int t = 0;
    thread_ids = (pthread_t*)calloc(NTHREADS, sizeof(pthread_t));
    /* create threads */
    for (t = 0; t < NTHREADS; t++) {
        initstate_r(random(), &rand_statebufs[t], PRNG_BUFSZ, &rand_states[t]);
        pthread_create(&thread_ids[t], NULL, &thread_run, &rand_states[t]);
    }
    for (t = 0; t < NTHREADS; t++) {
        pthread_join(thread_ids[t], NULL);
    }
    free(thread_ids);
    free(rand_states);
    free(rand_statebufs);
}

我感到困惑的是，在生成随机数时，双线程版本的性能比单线程版本要差得多，考虑到 random_r 应该用于多线程应用程序。

- Nixuz

2个回答

1

我不知道这是否相关，但我刚刚看到了非常相似的行为（使用2个线程比使用1个线程慢一个数量级）... 我基本上改变了一个：

  srand(seed);
  foo = rand();

转换为a

  myseed = seed;
  foo = rand_r(&myseed);

这样“修复”了它（现在可靠地两倍速 - 例如19秒而不是35秒，有2个线程）。

我不知道问题可能是什么 - 可能是锁定或 rand() 内部的缓存一致性？无论如何，还有一个 random_r() ，也许您（一年前）或其他人会用到它。

- Jerry

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- Pete Kirkham · Accepted Answer

一个非常简单的更改，可以将数据在内存中间隔开：

struct random_data* rand_states = (struct random_data*)calloc(NTHREADS * 64, sizeof(struct random_data));
char* rand_statebufs = (char*)calloc(NTHREADS*64, PRNG_BUFSZ);
pthread_t* thread_ids;
int t = 0;
thread_ids = (pthread_t*)calloc(NTHREADS, sizeof(pthread_t));
/* create threads */
for (t = 0; t < NTHREADS; t++) {
    initstate_r(random(), &rand_statebufs[t*64], PRNG_BUFSZ, &rand_states[t*64]);
    pthread_create(&thread_ids[t], NULL, &thread_run, &rand_states[t*64]);
}

在我的双核机器上运行时间大大加快。

这将证实一个猜想 - 你正在两个不同的线程中改变同一缓存行上的值，因此存在缓存争用。如果你还不知道这个问题，Herb Sutter的“机器架构 - 你的编程语言从未告诉过你”的演讲值得一看，他在1:20左右开始演示了伪共享。

计算出缓存行大小，并创建每个线程的数据，使其与之对齐。

将所有线程的数据放入一个结构体中，并进行对齐会更加整洁：

#define CACHE_LINE_SIZE 64

struct thread_data {
    struct random_data random_data;
    char statebuf[PRNG_BUFSZ];
    char padding[CACHE_LINE_SIZE - sizeof ( struct random_data )-PRNG_BUFSZ];
};

int main ( int argc, char** argv )
{
    printf ( "%zd\n", sizeof ( struct thread_data ) );

    void* apointer;

    if ( posix_memalign ( &apointer, sizeof ( struct thread_data ), NTHREADS * sizeof ( struct thread_data ) ) )
        exit ( 1 );

    struct thread_data* thread_states = apointer;

    memset ( apointer, 0, NTHREADS * sizeof ( struct thread_data ) );

    pthread_t* thread_ids;

    int t = 0;

    thread_ids = ( pthread_t* ) calloc ( NTHREADS, sizeof ( pthread_t ) );

    /* create threads */
    for ( t = 0; t < NTHREADS; t++ ) {
        initstate_r ( random(), thread_states[t].statebuf, PRNG_BUFSZ, &thread_states[t].random_data );
        pthread_create ( &thread_ids[t], NULL, &thread_run, &thread_states[t].random_data );
    }

    for ( t = 0; t < NTHREADS; t++ ) {
        pthread_join ( thread_ids[t], NULL );
    }

    free ( thread_ids );
    free ( thread_states );
}

使用 CACHE_LINE_SIZE 为 64：

refugio:$ gcc -O3 -o bin/nixuz_random_r src/nixuz_random_r.c -lpthread
refugio:$ time bin/nixuz_random_r 
64
63499495
944240966

real    0m1.278s
user    0m2.540s
sys 0m0.000s

或者您可以使用双倍缓存行大小，并使用malloc-额外的填充确保变异的内存位于单独的行上，因为malloc是16（如果我没记错）而不是64字节对齐。

（我将ITERATIONS减少了十倍，而不是拥有一个愚蠢的快速机器）