Question

根据规范，C中的函数rand（）使用互斥锁来锁定上下文（http://sourcecodebrowser.com/uclibc/0.9.27/rand_8c.html）。因此，如果我使用多个调用它的线程，我的程序将会很慢，因为所有线程都会尝试访问此锁定区域。

所以，我找到了另一个随机数生成器函数drand48（），它没有锁（http://sourcecodebrowser.com/uclibc/0.9.27/drand48_8c.html#af9329f9acef07ca14ea2256191c3ce74）。但是，不知何故，我的并行程序仍比串行程序慢！代码粘贴在下面：

串行版：

#include <cstdlib>

#define M 100000000

// Serial benchmark: call drand48() M times on the main thread and discard
// every result; only the cost of the calls is of interest.
int main()
{
    int calls = 0;

    while (calls < M) {
        drand48();
        ++calls;
    }
    return 0;
}

并行版本：

#include <pthread.h>
#include <cstdlib>

#define M 100000000
#define N 4

pthread_t threads[N];

// Thread start routine: performs this thread's share (M/N) of the drand48()
// calls and discards the results. The argument is not used.
void* f(void* p)
{
    (void)p;  // unused, but required by the pthread start-routine signature

    for (int i = 0; i < M/N; ++i)
        drand48();

    // A start routine must return a value: flowing off the end of a
    // non-void function is undefined behavior in C++.
    return NULL;
}

// Parallel benchmark driver: start N threads that each run f, then wait
// for every one of them to finish.
int main()
{
    pthread_t* const last = threads + N;

    for (pthread_t* t = threads; t != last; ++t) {
        pthread_create(t, NULL, f, NULL);
    }
    for (pthread_t* t = threads; t != last; ++t) {
        pthread_join(*t, NULL);
    }
    return 0;
}

我运行了这两个程序。串行版本的运行时间约为0.6秒，并行版本约为2.1秒。

有人能向我解释为什么会这样吗？

其他一些信息：我的电脑有4个核心。

我使用以下命令编译串行版本：

g++ serial.cpp -o serial

并使用以下命令编译并行版本：

g++ parallel.cpp -lpthread -o parallel

编辑：

显然，每当我更新线程中的全局变量时，就会发生性能损失。在下面的例子中，x变量是全局的（请注意，在并行示例中，操作将是非线程安全的）：

串行：

#include <cstdlib>

#define M 1000000000

int x = 0;

// Serial benchmark: update the global x M times from the main thread.
// Each iteration adds 10 and subtracts 10, so the value of x is unchanged.
int main()
{
    int done = 0;

    while (done < M) {
        x = x + 10 - 10;
        ++done;
    }
    return 0;
}

并行：

#include <pthread.h>
#include <cstdlib>

#define M 1000000000
#define N 4

pthread_t threads[N];
int x;

// Thread start routine: performs this thread's share (M/N) of the updates to
// the global x. The unsynchronized access to x is deliberate here: the
// benchmark measures the cost of several threads writing the same global.
void* f(void* p)
{
    (void)p;  // unused, but required by the pthread start-routine signature

    for (int i = 0; i < M/N; ++i)
        x = x + 10 - 10;

    // A start routine must return a value: flowing off the end of a
    // non-void function is undefined behavior in C++.
    return NULL;
}

// Parallel benchmark driver: launch N threads running f and join them all.
int main()
{
    int started = 0;
    while (started < N) {
        pthread_create(&threads[started], NULL, f, NULL);
        ++started;
    }

    int joined = 0;
    while (joined < N) {
        pthread_join(threads[joined], NULL);
        ++joined;
    }
    return 0;
}

请注意，drand48（）使用全局结构变量 __libc_drand48_data 。

Answer 1

drand48()使用全局结构变量__libc_drand48_data来保存状态（会对其进行写入），因此它是缓存行争用的来源，这很可能就是性能下降的根源。这并不是我最初怀疑并在评论中提到的伪共享（false sharing），而是真正的共享（true sharing）。drand48（）的实现中没有加锁，原因有两个：

drand48（）不要求是线程安全的（not required to be thread-safe）：“drand48（）、lrand48（）和mrand48（）函数不需要是线程安全的。”

如果两个线程碰巧同时访问它，并且它们对内存的写入是交错的，也不会造成任何损害——数据结构不会损坏，毕竟它本来就应该返回伪随机数据。

当一个线程正在初始化状态时，使用drand48（）存在一些微妙的注意事项（竞争条件），但这些被认为是无害的。

请注意下面__drand48_iterate是如何把三个16位字存回全局变量的：随机数生成器在该全局变量中保存其状态，这正是线程之间缓存行争用的来源。

/* Write the low 48 bits of result back as three 16-bit words. */
xsubi[0] = result & 0xffff;
xsubi[1] = (result >> 16) & 0xffff;
xsubi[2] = (result >> 32) & 0xffff;

源代码

您提供了drand48()源代码的链接，我把该源代码附在下面以供参考。问题在于状态更新时的缓存行争用。

#include <stdlib.h>

/* Global state for non-reentrant functions.  Defined in drand48-iter.c.  */

extern struct drand48_data __libc_drand48_data;

/* Non-reentrant entry point: hands the global state __libc_drand48_data to
   erand48_r and returns the double that call stores.  */
double drand48(void)
{
    double value;

    erand48_r (__libc_drand48_data.__x, &__libc_drand48_data, &value);

    return value;
}

以下是erand48_r的源代码：

extern int __drand48_iterate(unsigned short xsubi[3], struct drand48_data *buffer);

/* Advance the state in XSUBI by one step (using the parameters held in
   BUFFER) and store the corresponding double in *RESULT.

   Returns 0 on success, or -1 if __drand48_iterate reports failure, in
   which case *RESULT is left untouched.

   Written as a prototype-style definition: old-style (K&R) parameter
   declarations are obsolescent and were removed in C23.  */
int erand48_r (unsigned short int xsubi[3], struct drand48_data *buffer,
               double *result)
{
    union ieee754_double temp;

    /* Compute next state.  */
    if (__drand48_iterate (xsubi, buffer) < 0)
        return -1;

    /* Construct a positive double with the 48 random bits distributed over
       its fractional part so the resulting FP number is [0.0,1.0).  */

    temp.ieee.negative = 0;
    temp.ieee.exponent = IEEE754_DOUBLE_BIAS;
    temp.ieee.mantissa0 = (xsubi[2] << 4) | (xsubi[1] >> 12);
    temp.ieee.mantissa1 = ((xsubi[1] & 0xfff) << 20) | (xsubi[0] << 4);

    /* Please note the lower 4 bits of mantissa1 are always 0.  */
    *result = temp.d - 1.0;

    return 0;
}

__drand48_iterate的实现，它将新的状态写回全局变量：

/* Advance the 48-bit state stored in XSUBI by one step of the recurrence
   state * a + c, taking the multiplier a and addend c from BUFFER.
   Always returns 0.  */
int
__drand48_iterate (unsigned short int xsubi[3], struct drand48_data *buffer)
{
    uint64_t current;
    uint64_t result;

    /* On first use, install the default multiplier and addend.  The init
       flag is set last, after both parameters have been stored.  */
    if (unlikely(!buffer->__init))
    {
        buffer->__a = 0x5deece66dull;
        buffer->__c = 0xb;
        buffer->__init = 1;
    }

    /* Assemble the three 16-bit words into a single 64-bit value.  Only the
       low 48 bits of the product are kept below, so the extra width of the
       type does not matter.  */
    current = xsubi[0];
    current |= (uint32_t) xsubi[1] << 16;
    current |= (uint64_t) xsubi[2] << 32;

    result = current * buffer->__a + buffer->__c;

    /* Store the low 48 bits back as three 16-bit words.  */
    xsubi[0] = result & 0xffff;
    xsubi[1] = (result >> 16) & 0xffff;
    xsubi[2] = (result >> 32) & 0xffff;

    return 0;
}

pthreads和drand48并发性能

1 个答案:

源代码