So I'm trying to measure the latencies of the L1, L2, and L3 caches using C. I know their sizes and I feel I understand conceptually how to do it, but I'm running into problems with my implementation. I'm wondering whether some other hardware intricacy, such as prefetching, is causing issues.
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h> // for srand()
int main(){
srand(time(NULL)); // Seed ONCE
const int L1_CACHE_SIZE = 32768/sizeof(int);
const int L2_CACHE_SIZE = 262144/sizeof(int);
const int L3_CACHE_SIZE = 6587392/sizeof(int);
const int NUM_ACCESSES = 1000000;
const int SECONDS_PER_NS = 1000000000;
int arrayAccess[L1_CACHE_SIZE];
int arrayInvalidateL1[L1_CACHE_SIZE];
int arrayInvalidateL2[L2_CACHE_SIZE];
int arrayInvalidateL3[L3_CACHE_SIZE];
int count=0;
int index=0;
int i=0;
struct timespec startAccess, endAccess;
double mainMemAccess, L1Access, L2Access, L3Access;
int readValue=0;
memset(arrayAccess, 0, L1_CACHE_SIZE*sizeof(int));
memset(arrayInvalidateL1, 0, L1_CACHE_SIZE*sizeof(int));
memset(arrayInvalidateL2, 0, L2_CACHE_SIZE*sizeof(int));
memset(arrayInvalidateL3, 0, L3_CACHE_SIZE*sizeof(int));
index = 0;
clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
while (index < L1_CACHE_SIZE) {
int tmp = arrayAccess[index]; //Access Value from L2
index = (index + tmp + ((index & 4) ? 28 : 36)); // on average this should give 32 element skips, with changing strides
count++; //divide overall time by this
}
clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
mainMemAccess = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
mainMemAccess /= count;
printf("Main Memory Access %lf\n", mainMemAccess);
index = 0;
count=0;
clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
while (index < L1_CACHE_SIZE) {
int tmp = arrayAccess[index]; //Access Value from L2
index = (index + tmp + ((index & 4) ? 28 : 36)); // on average this should give 32 element skips, with changing strides
count++; //divide overall time by this
}
clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
L1Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
L1Access /= count;
printf("L1 Cache Access %lf\n", L1Access);
//invalidate L1 by accessing all elements of array which is larger than cache
for(count=0; count < L1_CACHE_SIZE; count++){
int read = arrayInvalidateL1[count];
read++;
readValue+=read;
}
index = 0;
count = 0;
clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
while (index < L1_CACHE_SIZE) {
int tmp = arrayAccess[index]; //Access Value from L2
index = (index + tmp + ((index & 4) ? 28 : 36)); // on average this should give 32 element skips, with changing strides
count++; //divide overall time by this
}
clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
L2Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
L2Access /= count;
printf("L2 Cache Acces %lf\n", L2Access);
//invalidate L2 by accessing all elements of array which is larger than cache
for(count=0; count < L2_CACHE_SIZE; count++){
int read = arrayInvalidateL2[count];
read++;
readValue+=read;
}
index = 0;
count=0;
clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
while (index < L1_CACHE_SIZE) {
int tmp = arrayAccess[index]; //Access Value from L2
index = (index + tmp + ((index & 4) ? 28 : 36)); // on average this should give 32 element skips, with changing strides
count++; //divide overall time by this
}
clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
L3Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
L3Access /= count;
printf("L3 Cache Access %lf\n", L3Access);
printf("Read Value: %d", readValue);
}
I first access a value in the array I want data from. This should obviously come from main memory, since it's the first access. The array is small (less than a page), so it should be copied into L1, L2, and L3. Then I access values from that same array, which should now be in L1. Then I access all the values of an array the same size as the L1 cache, to invalidate the data I want to access (so now it should only be in L2/L3). Then I repeat this process for L2 and L3. The access times are clearly off, though, which means I'm doing something wrong...
I think there may be an issue with the clock (starting and stopping it takes some time in ns, and that time changes depending on whether the clock code itself is cached or not).
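One way to gauge that timer overhead (a quick standalone sketch, separate from the code above) is to time a large number of back-to-back clock_gettime calls and average:

#include <stdio.h>
#include <time.h>
int main(void){
    const int N = 1000000;
    struct timespec t0, t1, s, e;
    clock_gettime(CLOCK_REALTIME, &t0);
    for (int i = 0; i < N; i++) {
        clock_gettime(CLOCK_REALTIME, &s); // start stamp
        clock_gettime(CLOCK_REALTIME, &e); // stop stamp, nothing in between
    }
    clock_gettime(CLOCK_REALTIME, &t1);
    double ns = (t1.tv_sec - t0.tv_sec) * 1000000000.0 + (t1.tv_nsec - t0.tv_nsec);
    printf("Average cost of one start/stop pair: %f ns\n", ns / N);
    return 0;
}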
Can someone give me some pointers on what I might be doing wrong?
UPDATE 1: So I amortized the cost of the timer by making a large number of accesses, I fixed the sizes of my caches, and I also took the advice to use a more complex indexing scheme to avoid fixed strides. Unfortunately the times are still off. They all seem to be coming from L1. I'm thinking the problem might be the invalidation rather than the access. Would a random vs. LRU replacement scheme affect which data gets invalidated?
UPDATE 2: Fixed the memsets (added an L3 memset so the data is also invalidated in L3 and the first access really starts from main memory) and the indexing scheme, still no luck.
UPDATE 3: I couldn't get this approach to work, but there were some good suggestions in the answers, and I posted a couple of solutions of my own.
I also ran Cachegrind to look at the hits/misses.
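For reference, output like the below typically comes from an invocation along these lines (the binary name here is just a placeholder); the summary is printed by Valgrind itself and the annotated listing by cg_annotate:

valgrind --tool=cachegrind ./cache_latency
cg_annotate cachegrind.out.<pid>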
==6710== I refs: 1,735,104
==6710== I1 misses: 1,092
==6710== LLi misses: 1,084
==6710== I1 miss rate: 0.06%
==6710== LLi miss rate: 0.06%
==6710==
==6710== D refs: 1,250,696 (721,162 rd + 529,534 wr)
==6710== D1 misses: 116,492 ( 7,627 rd + 108,865 wr)
==6710== LLd misses: 115,102 ( 6,414 rd + 108,688 wr)
==6710== D1 miss rate: 9.3% ( 1.0% + 20.5% )
==6710== LLd miss rate: 9.2% ( 0.8% + 20.5% )
==6710==
==6710== LL refs: 117,584 ( 8,719 rd + 108,865 wr)
==6710== LL misses: 116,186 ( 7,498 rd + 108,688 wr)
==6710== LL miss rate: 3.8% ( 0.3% + 20.5% )
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
. . . . . . . . . #include <time.h>
. . . . . . . . . #include <stdio.h>
. . . . . . . . . #include <string.h>
. . . . . . . . .
6 0 0 0 0 0 2 0 0 int main(){
5 1 1 0 0 0 2 0 0 srand(time(NULL)); // Seed ONCE
1 0 0 0 0 0 1 0 0 const int L1_CACHE_SIZE = 32768/sizeof(int);
1 0 0 0 0 0 1 0 0 const int L2_CACHE_SIZE = 262144/sizeof(int);
1 0 0 0 0 0 1 0 0 const int L3_CACHE_SIZE = 6587392/sizeof(int);
1 0 0 0 0 0 1 0 0 const int NUM_ACCESSES = 1000000;
1 0 0 0 0 0 1 0 0 const int SECONDS_PER_NS = 1000000000;
21 2 2 3 0 0 3 0 0 int arrayAccess[L1_CACHE_SIZE];
21 1 1 3 0 0 3 0 0 int arrayInvalidateL1[L1_CACHE_SIZE];
21 2 2 3 0 0 3 0 0 int arrayInvalidateL2[L2_CACHE_SIZE];
21 1 1 3 0 0 3 0 0 int arrayInvalidateL3[L3_CACHE_SIZE];
1 0 0 0 0 0 1 0 0 int count=0;
1 1 1 0 0 0 1 0 0 int index=0;
1 0 0 0 0 0 1 0 0 int i=0;
. . . . . . . . . struct timespec startAccess, endAccess;
. . . . . . . . . double mainMemAccess, L1Access, L2Access, L3Access;
1 0 0 0 0 0 1 0 0 int readValue=0;
. . . . . . . . .
7 0 0 2 0 0 1 1 1 memset(arrayAccess, 0, L1_CACHE_SIZE*sizeof(int));
7 1 1 2 2 0 1 0 0 memset(arrayInvalidateL1, 0, L1_CACHE_SIZE*sizeof(int));
7 0 0 2 2 0 1 0 0 memset(arrayInvalidateL2, 0, L2_CACHE_SIZE*sizeof(int));
7 1 1 2 2 0 1 0 0 memset(arrayInvalidateL3, 0, L3_CACHE_SIZE*sizeof(int));
. . . . . . . . .
1 0 0 0 0 0 1 1 1 index = 0;
4 0 0 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
772 1 1 514 0 0 0 0 0 while (index < L1_CACHE_SIZE) {
1,280 1 1 768 257 257 256 0 0 int tmp = arrayAccess[index]; //Access Value from L2
2,688 0 0 768 0 0 256 0 0 index = (index + tmp + ((index & 4) ? 28 : 36)); // on average this should give 32 element skips, with changing strides
256 0 0 256 0 0 0 0 0 count++; //divide overall time by this
. . . . . . . . . }
4 0 0 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
14 1 1 5 1 1 1 1 1 mainMemAccess = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
6 0 0 2 0 0 1 0 0 mainMemAccess /= count;
. . . . . . . . .
6 1 1 2 0 0 2 0 0 printf("Main Memory Access %lf\n", mainMemAccess);
. . . . . . . . .
1 0 0 0 0 0 1 0 0 index = 0;
1 0 0 0 0 0 1 0 0 count=0;
4 1 1 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
772 1 1 514 0 0 0 0 0 while (index < L1_CACHE_SIZE) {
1,280 0 0 768 240 0 256 0 0 int tmp = arrayAccess[index]; //Access Value from L2
2,688 0 0 768 0 0 256 0 0 index = (index + tmp + ((index & 4) ? 28 : 36)); // on average this should give 32 element skips, with changing strides
256 0 0 256 0 0 0 0 0 count++; //divide overall time by this
. . . . . . . . . }
4 0 0 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
14 1 1 5 0 0 1 1 0 L1Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
6 1 1 2 0 0 1 0 0 L1Access /= count;
. . . . . . . . .
6 0 0 2 0 0 2 0 0 printf("L1 Cache Access %lf\n", L1Access);
. . . . . . . . .
. . . . . . . . . //invalidate L1 by accessing all elements of array which is larger than cache
32,773 1 1 24,578 0 0 1 0 0 for(count=0; count < L1_CACHE_SIZE; count++){
40,960 0 0 24,576 513 513 8,192 0 0 int read = arrayInvalidateL1[count];
8,192 0 0 8,192 0 0 0 0 0 read++;
16,384 0 0 16,384 0 0 0 0 0 readValue+=read;
. . . . . . . . . }
. . . . . . . . .
1 0 0 0 0 0 1 0 0 index = 0;
1 1 1 0 0 0 1 0 0 count = 0;
4 0 0 0 0 0 1 1 0 clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
772 1 1 514 0 0 0 0 0 while (index < L1_CACHE_SIZE) {
1,280 0 0 768 256 0 256 0 0 int tmp = arrayAccess[index]; //Access Value from L2
2,688 0 0 768 0 0 256 0 0 index = (index + tmp + ((index & 4) ? 28 : 36)); // on average this should give 32 element skips, with changing strides
256 0 0 256 0 0 0 0 0 count++; //divide overall time by this
. . . . . . . . . }
4 1 1 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
14 0 0 5 1 0 1 1 0 L2Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
6 1 1 2 0 0 1 0 0 L2Access /= count;
. . . . . . . . .
6 0 0 2 0 0 2 0 0 printf("L2 Cache Acces %lf\n", L2Access);
. . . . . . . . .
. . . . . . . . . //invalidate L2 by accessing all elements of array which is larger than cache
262,149 2 2 196,610 0 0 1 0 0 for(count=0; count < L2_CACHE_SIZE; count++){
327,680 0 0 196,608 4,097 4,095 65,536 0 0 int read = arrayInvalidateL2[count];
65,536 0 0 65,536 0 0 0 0 0 read++;
131,072 0 0 131,072 0 0 0 0 0 readValue+=read;
. . . . . . . . . }
. . . . . . . . .
1 0 0 0 0 0 1 0 0 index = 0;
1 0 0 0 0 0 1 0 0 count=0;
4 0 0 0 0 0 1 1 0 clock_gettime(CLOCK_REALTIME, &startAccess); //sreadValue+=read;tart clock
772 1 1 514 0 0 0 0 0 while (index < L1_CACHE_SIZE) {
1,280 0 0 768 256 0 256 0 0 int tmp = arrayAccess[index]; //Access Value from L2
2,688 0 0 768 0 0 256 0 0 index = (index + tmp + ((index & 4) ? 28 : 36)); // on average this should give 32 element skips, with changing strides
256 0 0 256 0 0 0 0 0 count++; //divide overall time by this
. . . . . . . . . }
4 0 0 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
14 1 1 5 1 0 1 1 0 L3Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
6 0 0 2 0 0 1 0 0 L3Access /= count;
. . . . . . . . .
6 1 1 2 0 0 2 0 0 printf("L3 Cache Access %lf\n", L3Access);
. . . . . . . . .
6 0 0 1 0 0 1 0 0 printf("Read Value: %d", readValue);
. . . . . . . . .
3 0 0 3 0 0 0 0 0 }
Answer 0 (score: 26)
I would rather try to use the hardware clock as a measure. The rdtsc instruction will tell you the current cycle count since the CPU was powered on. Also, it is better to use asm to make sure the same instructions are always used in both the measured and the dry runs. Using that and some clever statistics, I made this a long while ago:
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>
int i386_cpuid_caches (size_t * data_caches) {
int i;
int num_data_caches = 0;
for (i = 0; i < 32; i++) {
// Variables to hold the contents of the 4 i386 legacy registers
uint32_t eax, ebx, ecx, edx;
eax = 4; // get cache info
ecx = i; // cache id
asm (
"cpuid" // call i386 cpuid instruction
: "+a" (eax) // contains the cpuid command code, 4 for cache query
, "=b" (ebx)
, "+c" (ecx) // contains the cache id
, "=d" (edx)
); // generates output in 4 registers eax, ebx, ecx and edx
// taken from http://download.intel.com/products/processor/manual/325462.pdf Vol. 2A 3-149
int cache_type = eax & 0x1F;
if (cache_type == 0) // end of valid cache identifiers
break;
char * cache_type_string;
switch (cache_type) {
case 1: cache_type_string = "Data Cache"; break;
case 2: cache_type_string = "Instruction Cache"; break;
case 3: cache_type_string = "Unified Cache"; break;
default: cache_type_string = "Unknown Type Cache"; break;
}
int cache_level = (eax >>= 5) & 0x7;
int cache_is_self_initializing = (eax >>= 3) & 0x1; // does not need SW initialization
int cache_is_fully_associative = (eax >>= 1) & 0x1;
// taken from http://download.intel.com/products/processor/manual/325462.pdf 3-166 Vol. 2A
// ebx contains 3 integers of 10, 10 and 12 bits respectively
unsigned int cache_sets = ecx + 1;
unsigned int cache_coherency_line_size = (ebx & 0xFFF) + 1;
unsigned int cache_physical_line_partitions = ((ebx >>= 12) & 0x3FF) + 1;
unsigned int cache_ways_of_associativity = ((ebx >>= 10) & 0x3FF) + 1;
// Total cache size is the product
size_t cache_total_size = cache_ways_of_associativity * cache_physical_line_partitions * cache_coherency_line_size * cache_sets;
if (cache_type == 1 || cache_type == 3) {
data_caches[num_data_caches++] = cache_total_size;
}
printf(
"Cache ID %d:\n"
"- Level: %d\n"
"- Type: %s\n"
"- Sets: %d\n"
"- System Coherency Line Size: %d bytes\n"
"- Physical Line partitions: %d\n"
"- Ways of associativity: %d\n"
"- Total Size: %zu bytes (%zu kb)\n"
"- Is fully associative: %s\n"
"- Is Self Initializing: %s\n"
"\n"
, i
, cache_level
, cache_type_string
, cache_sets
, cache_coherency_line_size
, cache_physical_line_partitions
, cache_ways_of_associativity
, cache_total_size, cache_total_size >> 10
, cache_is_fully_associative ? "true" : "false"
, cache_is_self_initializing ? "true" : "false"
);
}
return num_data_caches;
}
int test_cache(size_t attempts, size_t lower_cache_size, int * latencies, size_t max_latency) {
int fd = open("/dev/urandom", O_RDONLY);
if (fd < 0) {
perror("open");
abort();
}
char * random_data = mmap(
NULL
, lower_cache_size
, PROT_READ | PROT_WRITE
, MAP_PRIVATE | MAP_ANON // | MAP_POPULATE
, -1
, 0
); // get some random data
if (random_data == MAP_FAILED) {
perror("mmap");
abort();
}
size_t i;
for (i = 0; i < lower_cache_size; i += sysconf(_SC_PAGESIZE)) {
random_data[i] = 1;
}
int64_t random_offset = 0;
while (attempts--) {
// use processor clock timer for exact measurement
random_offset += rand();
random_offset %= lower_cache_size;
int32_t cycles_used, edx, temp1, temp2;
asm (
"mfence\n\t" // memory fence
"rdtsc\n\t" // get cpu cycle count
"mov %%edx, %2\n\t"
"mov %%eax, %3\n\t"
"mfence\n\t" // memory fence
"mov %4, %%al\n\t" // load data
"mfence\n\t"
"rdtsc\n\t"
"sub %2, %%edx\n\t" // substract cycle count
"sbb %3, %%eax" // substract cycle count
: "=a" (cycles_used)
, "=d" (edx)
, "=r" (temp1)
, "=r" (temp2)
: "m" (random_data[random_offset])
);
// printf("%d\n", cycles_used);
if (cycles_used < max_latency)
latencies[cycles_used]++;
else
latencies[max_latency - 1]++;
}
munmap(random_data, lower_cache_size);
return 0;
}
int main() {
size_t cache_sizes[32];
int num_data_caches = i386_cpuid_caches(cache_sizes);
int latencies[0x400];
memset(latencies, 0, sizeof(latencies));
int empty_cycles = 0;
int i;
int attempts = 1000000;
for (i = 0; i < attempts; i++) { // measure how much overhead we have for counting cyscles
int32_t cycles_used, edx, temp1, temp2;
asm (
"mfence\n\t" // memory fence
"rdtsc\n\t" // get cpu cycle count
"mov %%edx, %2\n\t"
"mov %%eax, %3\n\t"
"mfence\n\t" // memory fence
"mfence\n\t"
"rdtsc\n\t"
"sub %2, %%edx\n\t" // substract cycle count
"sbb %3, %%eax" // substract cycle count
: "=a" (cycles_used)
, "=d" (edx)
, "=r" (temp1)
, "=r" (temp2)
:
);
if (cycles_used < sizeof(latencies) / sizeof(*latencies))
latencies[cycles_used]++;
else
latencies[sizeof(latencies) / sizeof(*latencies) - 1]++;
}
{
int j;
size_t sum = 0;
for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
sum += latencies[j];
}
size_t sum2 = 0;
for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
sum2 += latencies[j];
if (sum2 >= sum * .75) {
empty_cycles = j;
fprintf(stderr, "Empty counting takes %d cycles\n", empty_cycles);
break;
}
}
}
for (i = 0; i < num_data_caches; i++) {
test_cache(attempts, cache_sizes[i] * 4, latencies, sizeof(latencies) / sizeof(*latencies));
int j;
size_t sum = 0;
for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
sum += latencies[j];
}
size_t sum2 = 0;
for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
sum2 += latencies[j];
if (sum2 >= sum * .75) {
fprintf(stderr, "Cache ID %i has latency %d cycles\n", i, j - empty_cycles);
break;
}
}
}
return 0;
}
Output on my Core2Duo:
Cache ID 0:
- Level: 1
- Type: Data Cache
- Total Size: 32768 bytes (32 kb)
Cache ID 1:
- Level: 1
- Type: Instruction Cache
- Total Size: 32768 bytes (32 kb)
Cache ID 2:
- Level: 2
- Type: Unified Cache
- Total Size: 262144 bytes (256 kb)
Cache ID 3:
- Level: 3
- Type: Unified Cache
- Total Size: 3145728 bytes (3072 kb)
Empty counting takes 90 cycles
Cache ID 0 has latency 6 cycles
Cache ID 2 has latency 21 cycles
Cache ID 3 has latency 168 cycles
Answer 1 (score: 8)
OK, there are several issues with your code:
As you mention, your measurement takes a long time. In fact, it very likely takes far longer than the single access itself, so it is not measuring anything useful. To mitigate that, access multiple elements and amortize (divide the overall time by the number of accesses). Note that to measure latency you want these accesses to be serialized, otherwise they can be performed in parallel and you would only measure the throughput of unrelated accesses. To achieve that, just add a false dependency between the accesses.
For example, initialize the array to zeros, and do:
clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
for (int i = 0; i < NUM_ACCESSES; ++i) {
int tmp = arrayAccess[index]; //Access Value from Main Memory
index = (index + i + tmp) & 1023;
}
clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
...and of course remember to divide the time by NUM_ACCESSES.
Now, I've made the index deliberately complicated so that you avoid a fixed stride that might trigger the prefetcher (a bit of an overkill, you're not that likely to notice an impact, but for the sake of demonstration...). You could probably settle for a simple index += 32, which would give you strides of 128 bytes (two cache lines) and avoid the "benefit" of most simple adjacent-line / simple stream prefetchers. I've also replaced the % 1000 with & 1023, since & is faster, but it needs a power of two to work the same way - so just increase ACCESS_SIZE to 1024 and it should work.
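For illustration, the simpler fixed-stride variant could look something like this (a sketch reusing arrayAccess, NUM_ACCESSES and the timing variables from the question, with an ACCESS_SIZE of 1024 elements assumed):

clock_gettime(CLOCK_REALTIME, &startAccess);
for (int i = 0; i < NUM_ACCESSES; ++i) {
    int tmp = arrayAccess[index];      // the load feeds the next index, so accesses stay serialized
    index = (index + tmp + 32) & 1023; // fixed 32-element (128-byte) stride, wrapped at 1024 elements
}
clock_gettime(CLOCK_REALTIME, &endAccess);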
Invalidating the L1 by loading something else is fine, but the sizes look funny. You didn't specify your system, but 256000 seems pretty big for an L1. An L2 is usually 256k on many common modern x86 CPUs, for example. Also note that 256k is not 256000, but rather 256*1024 = 262144. The same goes for the second size: 1M is not 1024000, it's 1024*1024 = 1048576. Assuming that is indeed your L2 size (more likely an L3, but probably too small for that).
Your invalidation arrays are of type int, so each element is longer than a single byte (most likely 4 bytes, depending on the system). You are actually invalidating L1_CACHE_SIZE*sizeof(int) worth of bytes (and the same goes for the L2 invalidation loop).
memset receives the size in bytes, and your sizes are divided by sizeof(int).

Your invalidation reads are never used and may be optimized out. Try to accumulate the reads into some value and print it at the end, to avoid this possibility.
The memset at the beginning is also accessing the data, so your first loop is accessing data from the L3 (since the other two memsets were still effective in evicting it from L1+L2, although only partially, due to the size error).
The strides may be too small, so you get two accesses to the same cache line (L1 hits). Make sure they are spread enough by adding 32 elements (x4 bytes) - that's 2 cache lines, so you also won't get any adjacent-cacheline prefetch benefits.
Since NUM_ACCESSES is larger than ACCESS_SIZE, you are essentially repeating the same elements and would probably get L1 hits for them (so the average time shifts in favor of L1 access latency). Instead, try using the L1 size so you access the entire L1 (except for the skips) exactly once. For example, like this -
index = 0;
while (index < L1_CACHE_SIZE) {
int tmp = arrayAccess[index]; //Access Value from L2
index = (index + tmp + ((index & 4) ? 28 : 36)); // on average this should give 32 element skips, with changing strides
count++; //divide overall time by this
}
Don't forget to increase arrayAccess to the L1 size.
Now, with the changes above (more or less), I get something like this:
L1 Cache Access 7.812500
L2 Cache Acces 15.625000
L3 Cache Access 23.437500
Which still seems a bit long, but probably because it includes an additional dependency on arithmetic operations.
Answer 2 (score: 6)
The widely used classic test of cache latency is iterating over a linked list. It works on modern superscalar/superpipelined CPUs and even on out-of-order cores like ARM Cortex-A9+ and Intel Core 2 / Core ix. This method is used by the open-source lmbench - in the test lat_mem_rd (man page) - and in the CPU-Z latency measurement tool: http://cpuid.com/medias/files/softwares/misc/latency.zip (a native Windows binary).
Source of the lat_mem_rd test from lmbench: https://github.com/foss-for-synopsys-dwc-arc-processors/lmbench/blob/master/src/lat_mem_rd.c

The main test is
#define ONE p = (char **)*p;
#define FIVE ONE ONE ONE ONE ONE
#define TEN FIVE FIVE
#define FIFTY TEN TEN TEN TEN TEN
#define HUNDRED FIFTY FIFTY
void
benchmark_loads(iter_t iterations, void *cookie)
{
struct mem_state* state = (struct mem_state*)cookie;
register char **p = (char**)state->p[0];
register size_t i;
register size_t count = state->len / (state->line * 100) + 1;
while (iterations-- > 0) {
for (i = 0; i < count; ++i) {
HUNDRED;
}
}
use_pointer((void *)p);
state->p[0] = (char*)p;
}
So, after deciphering the macros, we do a lot of linear operations like:
p = (char**) *p; // (in intel syntax) == mov eax, [eax]
p = (char**) *p;
p = (char**) *p;
.... // 100 times total
p = (char**) *p;
walking over memory filled with pointers, each pointing stride elements forward.
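Setting up such a chain is simple; here is a minimal sketch (my own illustrative names, not lmbench's) of building a pointer ring with a fixed stride and then chasing it:

#include <stddef.h>

/* Fill 'buf' with a ring of pointers: each slot points 'stride' bytes ahead,
   and the last slot wraps back to the start. */
static char **build_ring(char *buf, size_t size, size_t stride) {
    size_t n = size / stride;
    for (size_t i = 0; i < n; i++) {
        char **slot = (char **)(buf + i * stride);
        *slot = (i + 1 < n) ? buf + (i + 1) * stride : buf;
    }
    return (char **)buf;
}

/* Chase the ring: every load depends on the previous one, so the latencies add up. */
static char **chase(char **p, size_t loads) {
    while (loads--)
        p = (char **)*p;
    return p; /* return p so the chain is not optimized away */
}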
As described in the man page http://www.bitmover.com/lmbench/lat_mem_rd.8.html:

The benchmark runs as two nested loops. The outer loop is the stride size. The inner loop is the array size. For each array size, the benchmark creates a ring of pointers that point forward one stride. Traversing the array is done by

p = (char **)*p;

in a for loop (the overhead of the for loop is not significant; the loop is an unrolled loop 1000 loads long). The loop stops after doing a million loads. The size of the array varies from 512 bytes to (typically) eight megabytes. For the small sizes, the cache will have an effect and the loads will be much faster. This becomes much more apparent when the data is plotted.
A more detailed description of an example on POWER is available from IBM's wiki: Untangling memory access measurements - lat_mem_rd - by Jenifer Hopper, 2013:
The lat_mem_rd test (http://www.bitmover.com/lmbench/lat_mem_rd.8.html) takes two arguments, an array size in MB and a stride size. The benchmark uses two loops to traverse the array, using the stride as the increment, creating a ring of pointers that point forward one stride. The test measures memory read latency in nanoseconds for a range of memory sizes. The output consists of two columns: the first is the array size in MB (a floating point value) and the second is the load latency over all the points of the array. When the results are graphed, you can clearly see the relative latencies of the entire memory hierarchy, including the faster latency of each cache level and the main memory latency.
PS: There is a paper from Intel (thanks to Eldar Abusalimov) with examples of running lat_mem_rd: ftp://download.intel.com/design/intarch/PAPERS/321074.pdf - sorry, the right URL is http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-cache-latency-bandwidth-paper.pdf "Measuring Cache and Memory Latency and CPU to Memory Bandwidth - For use with Intel Architecture" by Joshua Ruggiero, from December 2008:
Answer 3 (score: 1)
Not really an answer, but read on anyway; some of this has already been mentioned in other answers and comments.
Just the other day I answered this question; it covers measurement of L1/L2/.../L?/MEMORY transfer rates. Take a look at it for a better starting point for your problem.
[Notes]
I strongly recommend using the RDTSC instruction for time measurement,

especially for the L1, because anything else is too slow. Don't forget to set the process affinity to a single CPU, because every core has its own counter, and the counts differ a lot even on the same input clock!!!

Set the CPU clock to maximum for variable-clock machines, and don't forget to account for RDTSC overflow if you use only the 32-bit part (a modern CPU overflows a 32-bit counter in a second). For the time computation use the CPU clock (measure it or use a registry value):
t0 <- RDTSC
Sleep(250);
t1 <- RDTSC
CPU f=(t1-t0)<<2 [Hz]
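A rough C equivalent of that pseudocode, assuming an x86 compiler that provides the __rdtsc() intrinsic in <x86intrin.h> (the 250 ms sleep and the shift by 2 mirror the lines above):

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <x86intrin.h>   // __rdtsc()

int main(void) {
    struct timespec quarter = { 0, 250000000L }; // 250 ms
    uint64_t t0 = __rdtsc();
    nanosleep(&quarter, NULL);
    uint64_t t1 = __rdtsc();
    // ticks elapsed over a quarter of a second, times 4 = ticks per second
    printf("Approximate TSC frequency: %llu Hz\n", (unsigned long long)((t1 - t0) << 2));
    return 0;
}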
Set the process affinity to a single CPU.

All CPU cores usually have their own L1 and L2 caches, so on a multitasking OS you can measure confusing things if you don't do this.
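On Linux, pinning the process to one core might look roughly like this (a sketch using sched_setaffinity; other systems have their own APIs):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

// Pin the calling process to CPU 0 so every timestamp comes from the same core's counter.
static int pin_to_cpu0(void) {
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(0, &set);
    if (sched_setaffinity(0, sizeof(set), &set) != 0) {
        perror("sched_setaffinity");
        return -1;
    }
    return 0;
}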
Produce graphical output (a chart).

Then you will see what actually happens; in the link above I posted quite a few plots.
Use the highest process priority your OS provides.
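On Linux this could be a setpriority call (a sketch; negative nice values usually need elevated privileges):

#include <sys/resource.h>
#include <stdio.h>

// Request the highest conventional priority (nice -20) for the current process.
static void raise_priority(void) {
    if (setpriority(PRIO_PROCESS, 0, -20) != 0)
        perror("setpriority");
}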
Answer 4 (score: 0)
For those interested, I couldn't get my first code set to work, so I tried a couple of alternative approaches that produced decent results.
The first uses a linked list whose nodes are allocated a stride of bytes apart in a contiguous memory space. Dereferencing the nodes mitigates the effectiveness of the prefetcher, and in case multiple cache lines are pulled in, I made the stride significantly large to avoid cache hits. As the size of the allocated list increases, it jumps to the cache or memory structure that can hold it, showing clear divisions in latency.
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
//MACROS
#define ONE iterate = (char**) *iterate;
#define FIVE ONE ONE ONE ONE ONE
#define TWOFIVE FIVE FIVE FIVE FIVE FIVE
#define HUNDO TWOFIVE TWOFIVE TWOFIVE TWOFIVE
//prototype
void allocateRandomArray(long double);
void accessArray(char *, long double, char**);
int main(){
//call the function for allocating arrays of increasing size in MB
allocateRandomArray(.00049);
allocateRandomArray(.00098);
allocateRandomArray(.00195);
allocateRandomArray(.00293);
allocateRandomArray(.00391);
allocateRandomArray(.00586);
allocateRandomArray(.00781);
allocateRandomArray(.01172);
allocateRandomArray(.01562);
allocateRandomArray(.02344);
allocateRandomArray(.03125);
allocateRandomArray(.04688);
allocateRandomArray(.0625);
allocateRandomArray(.09375);
allocateRandomArray(.125);
allocateRandomArray(.1875);
allocateRandomArray(.25);
allocateRandomArray(.375);
allocateRandomArray(.5);
allocateRandomArray(.75);
allocateRandomArray(1);
allocateRandomArray(1.5);
allocateRandomArray(2);
allocateRandomArray(3);
allocateRandomArray(4);
allocateRandomArray(6);
allocateRandomArray(8);
allocateRandomArray(12);
allocateRandomArray(16);
allocateRandomArray(24);
allocateRandomArray(32);
allocateRandomArray(48);
allocateRandomArray(64);
allocateRandomArray(96);
allocateRandomArray(128);
allocateRandomArray(192);
}
void allocateRandomArray(long double size){
int accessSize=(1024*1024*size); //array size in bytes
char * randomArray = malloc(accessSize*sizeof(char)); //allocate array of size allocate size
int counter;
int strideSize=4096; //step size
char ** head = (char **) randomArray; //start of linked list in contiguous memory
char ** iterate = head; //iterator for linked list
for(counter=0; counter < accessSize; counter+=strideSize){
(*iterate) = &randomArray[counter+strideSize]; //iterate through linked list, having each one point stride bytes forward
iterate+=(strideSize/sizeof(iterate)); //increment iterator stride bytes forward
}
*iterate = (char *) head; //set tailf to point to head
accessArray(randomArray, size, head);
free(randomArray);
}
void accessArray(char *cacheArray, long double size, char** head){
const long double NUM_ACCESSES = 1000000000/100; //number of accesses to linked list
const int SECONDS_PER_NS = 1000000000; //const for timer
FILE *fp = fopen("accessData.txt", "a"); //open file for writing data
int newIndex=0;
int counter=0;
int read=0;
struct timespec startAccess, endAccess; //struct for timer
long double accessTime = 0;
char ** iterate = head; //create iterator
clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
for(counter=0; counter < NUM_ACCESSES; counter++){
HUNDO //macro substitutes 100 accesses to mitigate loop overhead
}
clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
//calculate the time elapsed in ns per access
accessTime = (((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec)) / (100*NUM_ACCESSES);
fprintf(fp, "%Lf\t%Lf\n", accessTime, size); //print results to file
fclose(fp); //close file
}
This produced the most consistent results, and using a variety of array sizes and plotting the corresponding latencies gives a very clear distinction between the different cache sizes.

The next method, like the previous one, allocates arrays of increasing size. But instead of using a linked list for the memory accesses, I fill each index with its own number and then shuffle the array randomly. I then use those indices to jump around the array randomly for the accesses, mitigating the effects of the prefetcher. It does, however, occasionally show large deviations in access time when multiple adjacent cache lines are pulled in and happen to be hit.
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
//prototype
void allocateRandomArray(long double);
void accessArray(int *, long int);
int main(){
srand(time(NULL)); // Seed random function
int i=0;
for(i=2; i < 32; i++){
allocateRandomArray(pow(2, i)); //call latency function on arrays of increasing size
}
}
void allocateRandomArray(long double size){
int accessSize = (size) / sizeof(int);
int * randomArray = malloc(accessSize*sizeof(int));
int counter;
for(counter=0; counter < accessSize; counter ++){
randomArray[counter] = counter;
}
for(counter=0; counter < accessSize; counter ++){
int i,j;
int swap;
i = rand() % accessSize;
j = rand() % accessSize;
swap = randomArray[i];
randomArray[i] = randomArray[j];
randomArray[j] = swap;
}
accessArray(randomArray, accessSize);
free(randomArray);
}
void accessArray(int *cacheArray, long int size){
const long double NUM_ACCESSES = 1000000000;
const int SECONDS_PER_NS = 1000000000;
int newIndex=0;
int counter=0;
int read=0;
struct timespec startAccess, endAccess;
long double accessTime = 0;
clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
for(counter = 0; counter < NUM_ACCESSES; counter++){
newIndex=cacheArray[newIndex];
}
clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
//calculate the time elapsed in ns per access
accessTime = (((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec)) / (NUM_ACCESSES);
printf("Access time: %Lf for size %ld\n", accessTime, size);
}
Averaged over many trials, this method produced relatively accurate results as well. The first option is definitely the better of the two, but this is a viable alternative approach.