Question

我有以下代码：

// Limit member alignment to 4 bytes for the record types declared below.
#pragma pack(4)

/// Record header: two consecutive 64-bit fields (16 bytes in total).
struct RECORD_HEADER {
    uint64_t msgType;  ///< message type tag (per the field name)
    uint64_t rdtsc;    ///< 64-bit value; presumably a TSC reading (per the field name)
};

/// Record payload: a raw 488-byte buffer.
struct BODY {
    char content[488];
};

// Switch back to the compiler's default packing.
#pragma pack()

/**
 * Reads the x86 time-stamp counter at the start and at the end of a timed
 * region.
 *
 * start() issues CPUID before RDTSC, and end() issues RDTSCP followed by
 * CPUID; CPUID is presumably there as a serializing fence (per the class
 * name). Both asm statements carry a "memory" clobber so that the compiler
 * does not move the loads/stores of the timed region across the counter
 * reads.
 */
class SerializedRDTSC {
public:
    /// Unit of the returned counter values (64-bit unsigned).
    typedef unsigned long long timeunit_t;

    /**
     * Counter read that opens a timed region.
     *
     * @return the 64-bit counter value assembled from EDX (high) and EAX (low).
     */
    static timeunit_t start(void) {
            unsigned cycles_high, cycles_low;
            // The registers written by CPUID/RDTSC are listed as clobbers so
            // the outputs are never allocated to them; "memory" makes the
            // statement a compiler-level barrier for memory accesses.
            __asm__ __volatile__ (  "CPUID\n\t"
                                    "RDTSC\n\t"
                                    "mov %%edx, %0\n\t"
                                    "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
                                    "%rax", "%rbx", "%rcx", "%rdx", "memory");
            return ( (unsigned long long)cycles_low)|( ((unsigned long long)cycles_high)<<32 );
    }

    /**
     * Counter read that closes a timed region.
     *
     * @return the 64-bit counter value assembled from EDX (high) and EAX (low).
     */
    static timeunit_t end(void) {
            unsigned cycles_high, cycles_low;
            // The counter halves are copied out before CPUID overwrites
            // EAX/EDX; "memory" keeps the timed stores ahead of this read.
            __asm__ __volatile__(   "RDTSCP\n\t"
                                    "mov %%edx, %0\n\t"
                                    "mov %%eax, %1\n\t"
                                    "CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax",
                                    "%rbx", "%rcx", "%rdx", "memory");
            return ( (unsigned long long)cycles_low)|( ((unsigned long long)cycles_high)<<32 );
    }

};

/**
 * Creates (or opens) the POSIX shared-memory object "testing", sizes it to
 * 1 << 26 bytes (64 MiB) and maps it read/write (MAP_SHARED) into this
 * process.
 *
 * The descriptor is closed before returning on every path; the mapping
 * itself stays valid after the descriptor is closed.
 *
 * @return pointer to the first byte of the mapping, or nullptr when
 *         shm_open, ftruncate or mmap fails (a diagnostic is written to
 *         stdout). A failing fchmod is only reported on stderr.
 */
char* createSHM() noexcept {
        const auto sharedMemHandle = shm_open("testing", O_RDWR | O_CREAT, 0666);
        if (-1 == sharedMemHandle) {
            // Capture errno first: stream output could overwrite it.
            const int err = errno;
            std::cout << "failed to open named shared memory: " << err << std::endl;
            return nullptr;
        }

        constexpr int32_t size = (1 << 26);
        // Without a successful resize the object may be shorter than the
        // mapping, and touching the mapped pages would raise SIGBUS.
        if (-1 == ftruncate(sharedMemHandle, size)) {
            const int err = errno;
            std::cout << "failed to resize named shared memory: " << err << std::endl;
            close(sharedMemHandle);
            return nullptr;
        }

        void* const mapping = mmap(nullptr, size, PROT_READ | PROT_WRITE,
                MAP_SHARED, sharedMemHandle, 0);
        if (MAP_FAILED == mapping) {
            const int err = errno;
            std::cout << err << std::endl;
            close(sharedMemHandle);
            return nullptr;
        }

        // Best effort: a failure here is reported but does not abort.
        const auto rc = fchmod(sharedMemHandle, 0666);
        if (rc == -1) {
            fprintf(stderr,
                    "Can't change permissions to 0666 on shared mem segment: %m\n");
            fflush(stderr);
        }

        // The descriptor is no longer needed once the mapping exists.
        close(sharedMemHandle);
        return static_cast<char*>(mapping);
}

/**
 * Benchmark driver: writes n * n2 records (two 64-bit header fields plus one
 * BODY, 504 bytes in total) back to back into the shared-memory mapping,
 * timing each record write with SerializedRDTSC, then prints one
 * "index,cycles" line per record.
 *
 * @return 0 on success, 1 when the shared-memory mapping could not be created.
 */
int main() {
    BODY update;

    srand(time(nullptr));
    char* ptr = createSHM();
    if (ptr == nullptr) {
        // createSHM() has already printed a diagnostic; writing through a
        // null pointer below would crash.
        return 1;
    }

    constexpr uint64_t n = 700;
    constexpr uint64_t n2 = 10;
    uint64_t m_data[n * n2];
    memset(m_data, 0, sizeof(m_data));

    uint64_t r = 0;

    for (uint64_t i = 0; i < n; i++) {
        for (uint64_t k = 0; k < n2; k++) {
            // populate the header; the values must really be 64 bits wide
            // because 8 bytes are copied out of each of them below
            const uint64_t msgType = rand();
            const uint64_t rdtsc = rand();

            // populate the struct randomly, one 32-bit word at a time
            // (memcpy instead of writing the char buffer through a uint32_t*)
            for (uint32_t j = 0; j < sizeof(BODY) / sizeof(uint32_t); j++) {
                const uint32_t v = rand() % 32767;
                memcpy(update.content + j * sizeof(uint32_t), &v, sizeof(v));
            }

            // write the struct; only these three copies are timed
            const auto s = SerializedRDTSC::start();
            memcpy(ptr, &msgType, sizeof(uint64_t));
            ptr+= sizeof(uint64_t);
            memcpy(ptr, &rdtsc, sizeof(uint64_t));
            ptr+= sizeof(uint64_t);
            memcpy(ptr, &update, sizeof(BODY));
            ptr+= sizeof(BODY);
            const auto e = SerializedRDTSC::end();
            m_data[r++] = e - s;
        }
        // pause for roughly a quarter of a second between bursts of n2 writes
        usleep(249998);
    }

    // '\n' instead of std::endl: no need to flush after every line
    for (uint64_t i = 0; i < r; i++) {
        std::cout << i << "," << m_data[i] << '\n';
    }
}

出于某种原因，从输出来看存在周期性的延迟峰值：

我已经隔离了CPU核心，并用htop再次确认没有其他进程在使用该核心。

我的机器有一个i7 CPU（没什么特别的）。

然后我换用Xeon CPU进行了尝试。模式大致相同——每写入7-11次，就会出现一次峰值。

在i7 CPU上，我用GCC 7.2按C++17标准编译，并在CentOS 7.3上运行。

在Xeon CPU上，我用GCC 4.6按C++0x标准编译，并在CentOS 6.5上运行。

我的问题是：1. 为什么会出现周期性的延迟峰值？（我用strace检查过，没有看到异常的系统调用。）2. 对于如何调查/理解这些峰值，有什么建议吗？这主要是为了我自己学习。

提前致谢！

P.S. 是的，有些人反对使用rdtsc来测量延迟，因为温度会影响TSC。但是我没有找到更好的选择：我没有PTP，而clock_gettime()有时也会出现延迟峰值。如果您有任何建议，非常欢迎 :)

Answer 1

内存页大小为4K字节。每次开始向一个新页写入时，都需要先把该页映射到进程的地址空间。由于您每次循环写入的数据是8 + 8 + 488 = 504字节，而4096 / 504 ≈ 8.1，因此每8或9次循环就会出现一次峰值。

由于CPU可以推测性地从内存中预取数据，第二页的缺页异常（本应发生在第8次循环）会提前一次循环发生，也就是在硬件预取器尝试访问该页的时候。

在Linux上写入共享内存时的周期性延迟峰值

1 个答案: