Question

我知道，作为提高应用程序性能的一种通用技巧，建议在设计结构体时让不被多个线程共享的成员位于不同的缓存行上。这样就不会出现伪共享（false sharing），每个线程的内存访问也会更高效。

我想验证这是否属实，所以通过修改我用来做实验的结构体，做了下面这个小测试。

// Baseline layout for the experiment: four plain int counters declared back
// to back with no alignment requested, so they are packed next to each other.
// writer_a touches a_ and d_; writer_b touches b_ and c_.
struct SharedSlow {
  int a_, b_, c_, d_;
};

// Padded layout for the experiment: every member starts on its own 32-byte
// boundary. The declaration order keeps writer_a's members (a_, d_) next to
// each other, followed by writer_b's members (b_, c_).
// NOTE(review): with four 32-byte slots each element is much larger than a
// SharedSlow element, so the same array length covers far more memory —
// keep that in mind when comparing timings.
struct SharedQuick {
  __attribute__((aligned(32))) int a_;  // written by writer_a
  __attribute__((aligned(32))) int d_;  // written by writer_a
  __attribute__((aligned(32))) int b_;  // written by writer_b
  __attribute__((aligned(32))) int c_;  // written by writer_b
};

将测试数据结构切换为 'SharedQuick' 之后，线程 'writer_b' 读/写的成员 b_ 和 c_ 现在位于与 a_ 和 d_ 不同的缓存行上（a_ 和 d_ 这两个成员由线程 'writer_a' 负责）。

我得到的结果非常糟糕：与所有成员位于同一缓存行的情况相比，性能反而差得多。

使用 time 测量：

// Shared Slow
real    0m2.063s
user    0m4.102s
sys     0m0.001s

// Shared Quick
real    0m11.328s
user    0m22.420s
sys     0m0.002s

使用 perf 测量：

// Shared Slow
16k cycles
16  cache-misses

// Shared Quick
89k cycles
88  cache-misses

两个线程各自运行在不同的 CPU 上，所以它们应该互不干扰地独立运行：你知道性能变差的原因是什么吗？

我可以看到，与第一个数据结构相比，CPU 做了更多的工作，但我不清楚它们在这段时间里忙于什么，也不清楚这些周期花在了哪里：你有什么建议吗？

// Number of complete passes each writer thread makes over the array.
#define NTIMES  100000
// Number of Shared elements allocated in main and visited on every pass.
#define ARRSIZE 10000


void* writer_a(void* args)
{
  cpu_set_t cpu;
  CPU_ZERO(&cpu);
  CPU_SET(0, &cpu);
  pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu);

  Shared* p = (Shared*)args;

  for(int j=0;j<NTIMES; j++){
  for(int i=0;i<ARRSIZE;i++){
    p[i].a_ = j;
    p[i].a_ += j;
    p[i].d_ = p[i].a_;
    p[i].d_ += j;
  }
  }
  return 0;
}

void* writer_b(void* args)
{
  cpu_set_t cpu;
  CPU_ZERO(&cpu);
  CPU_SET(15, &cpu);
  pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu);      

  Shared* p = (Shared*)args;

  for(int j=0;j<NTIMES; j++){
  for(int i=0;i<ARRSIZE;i++){
    p[i].b_ = j;
    p[i].b_ += j;
    p[i].c_ = p[i].b_;
    p[i].c_ += j;
  }
  }
}

// Selects the layout under test: keep exactly one of the two typedefs active.
//typedef SharedSlow Shared;   // packed layout (baseline)
typedef SharedQuick  Shared;  // padded layout; this is the variant measured as much slower

int main()
{
  pthread_t first, second;
  int ret = 0;

  Shared* p = new Shared[ARRSIZE];
  ret = pthread_create(&first,NULL, writer_a, p);
  if(ret != 0){
    cout << "err" << __LINE__ << endl;
    exit(EXIT_FAILURE);
  }
  ret = pthread_create(&second,NULL, writer_b, p);
  if(ret !=0){
    cout << "err" << __LINE__ << endl;
    exit(EXIT_FAILURE);
  }

  void *r1, *r2;
  pthread_join(first,  &r1);
  pthread_join(second, &r2);        
}

在单独的缓存行上隔离非共享数据

0 个答案: