我有一个关于并行编程中伪共享(false sharing)的问题。
源代码在底部。
在这里总结我的问题:
如果定义了 USE_REGISTER,运行
perf stat -e L1-dcache-load -e L1-dcache-load-misses ./falseSharing
并且设置
#define PADNUM 1
则结果是
1,204,618,669 L1-dcache-load 28,618,883 L1-dcache-load-misses # 2.38% of all L1-dcache hits
执行时间是
real 0m0.683s
然后我将 PADNUM 每次加 1 逐步增大,
在 PADNUM 达到 7 之前,执行时间几乎保持不变。
当 PADNUM >= 7 时,速度如预期提升到约 2 倍,
real 0m0.305s
此时的缓存未命中情况为
1,201,152,333 L1-dcache-load 69,266 L1-dcache-load-misses # 0.01% of all L1-dcache hits
如果 USE_REGISTER 按预期生效,部分变量会保存在寄存器中,因此 L1-dcache-load 的次数会降到约 700,000,000。
我的问题是:
为什么缓存未命中率只有约 2%~3%?凭直觉我猜应该在 50% 左右,因为两个线程是交错执行的。
为什么填充数组只需要 7 个元素?程序打印的地址信息如下,例如:
addr(x) = 0x602200, addr(y) = 0x602220, sizeof padding = 28, pad start at 0x602204, sizeof(f) = 36
这意味着在我的计算机上,填充为 28 个字节,sizeof(float) 为 4 个字节,因此缓存行的对齐似乎是 32 个字节,而不是 64 个字节。
但是,cat /proc/cpuinfo 显示缓存行是按 64 个字节对齐的:
cache_alignment:64
#include <stdio.h>
#include <stdlib.h>
#include <thread>
#include <iostream>
#include <sstream>
#include <mutex>
// REGISTER expands to the `register` storage-class specifier only when the
// build defines USE_REGISTER (e.g. -DUSE_REGISTER); otherwise it expands to
// nothing. An unconditional re-definition used to follow this block and made
// the switch ineffective, so it has been removed.
// NOTE: `register` was deprecated in C++11 and removed in C++17, so the
// USE_REGISTER variant needs an older -std= level or a lenient compiler.
#ifdef USE_REGISTER
#define REGISTER register
#else
#define REGISTER
#endif
using namespace std;
// Number of float elements placed between fooPad::x and fooPad::y.
#define PADNUM 1
// Layout under test for the false-sharing experiment: two floats separated by
// PADNUM padding floats, so changing PADNUM changes the distance between them.
struct fooPad {
// Read repeatedly by sum_a().
float x;
// Unused filler; only its size matters (it sets the byte distance from x to y).
float pad[PADNUM];
// Incremented repeatedly by inc_b().
float y;
};
// Single global instance shared by sum_a(), inc_b() and main().
fooPad f;
/* The two following functions are running concurrently: */
// Serializes writes to std::cout from DUMP_TID() so that lines printed by
// different threads do not interleave.
std::mutex g_display_mutex;

// Prints "<enclosing function>() thread = <thread id>" followed by a flush.
// The mutex is held through a std::lock_guard so that it is released even if
// the stream insertion throws (e.g. when exceptions are enabled on std::cout);
// the previous manual lock()/unlock() pair would have left it locked.
#define DUMP_TID() \
do { \
    std::lock_guard<std::mutex> dump_tid_guard(g_display_mutex); \
    std::cout << __FUNCTION__ << "() thread = " << std::this_thread::get_id() << std::endl; \
} while (0)
float sum_a(void)
{
DUMP_TID();
REGISTER float s = 0;
REGISTER float i;
for (i = 0; i < 10000000; ++i)
s += f.x;
return s;
}
/**
 * Writer side of the experiment: prints the calling thread's id, then adds
 * 1 to the global f.y 10,000,000 times.
 *
 * @return the value of f.y after the loop.
 */
float inc_b(void)
{
    DUMP_TID();
    // Integer trip counter instead of the former float counter; an int counts
    // exactly regardless of the iteration limit.
    REGISTER int i;
    for (i = 0; i < 10000000; ++i) {
        // Float literal avoids a float -> double -> float round trip per
        // iteration; the rounded result of the addition is the same.
        f.y += 1.0f;
    }
    return f.y;
}
/**
 * Driver for the experiment.
 *
 * Initializes the global f, prints the addresses and sizes of its members,
 * then runs sum_a() and inc_b() ten times, each as its own OpenMP task
 * inside a two-thread parallel region. Without OpenMP support enabled at
 * compile time the pragmas are ignored and the two calls run one after the
 * other.
 *
 * @return 0 on completion.
 */
int main()
{
    float a = 0;
    float b = 0;
    f.x = 1.0f;
    f.y = 1.5f;

    // %p requires a void* argument and sizeof yields size_t (%zu); the
    // earlier %ld / float* combination did not match the format string.
    printf("addr(x) = %p, addr(y) = %p, sizeof padding = %zu, pad start at %p, sizeof(f) = %zu\n",
           static_cast<void*>(&f.x),
           static_cast<void*>(&f.y),
           sizeof(f.pad),
           static_cast<void*>(&f.pad[0]),
           sizeof(f));

    for (int i = 0; i < 10; i++) {
#pragma omp parallel num_threads(2)
#pragma omp single
        {
            // One thread creates both tasks; taskwait blocks until both
            // results have been stored into a and b.
#pragma omp task
            a = sum_a();
#pragma omp task
            b = inc_b();
#pragma omp taskwait
        }
    }

    printf("a = %f, b = %f\n", a, b);
    // Report success; the previous `return 1` signalled failure to the shell.
    return 0;
}
/*
int main()
{
arr[0] = 1; arr[1] = 2;
float a = 0;
float b = 0;
f.x = 1.0;
f.y = 1.5;
printf("size %ld %ld %ld\n", sizeof(f.x),sizeof(f.y), sizeof(f) );
printf("size %ld %ld %ld\n", sizeof(f1.x),sizeof(f1.y), sizeof(f1) );
printf("size %ld %ld %ld\n", sizeof(f2.x),sizeof(f2.y), sizeof(f2) );
for (int i = 0; i < 10; i++) {
#pragma omp parallel num_threads(1)
#pragma omp single
{
#pragma omp task
a = sum_a();
#pragma omp task
b = inc_b();
#pragma omp taskwait
}
}
printf("a = %f, b = %f\n", a, b);
return 1;
}
*/