英特尔VTune结果理解 - 天真的问题

时间:2016-11-04 17:04:17

标签: performance vtune

我想加速我的应用程序,它对一个大数组(大约1e8个元素)逐元素进行处理。

每个元素的处理过程非常简单,我怀疑瓶颈可能不是CPU而是DRAM带宽。 所以我决定先研究单线程版本。

系统是:Windows 10 64位,32 GB RAM,Intel Core i7-3770S Ivybridge 1.10 GHz 4核,启用超线程

并发分析

Elapsed Time:   34.425s
    CPU Time:   14.908s
        Effective Time: 14.908s
            Idle:   0.005s
            Poor:   14.902s
            Ok: 0s
            Ideal:  0s
            Over:   0s
        Spin Time:  0s
        Overhead Time:  0s
    Wait Time:  0.000s
        Idle:   0.000s
        Poor:   0s
        Ok: 0s
        Ideal:  0s
        Over:   0s
    Total Thread Count: 2
    Paused Time:    18.767s

内存访问分析

对相同数量的数据连续运行三次,内存访问分析给出的CPU时间各不相同。正如并发分析所示,实际执行时间约为23秒。

Elapsed Time:   33.526s
    CPU Time:   5.740s
    Memory Bound:   38.3%
        L1 Bound:   10.4%
        L2 Bound:   0.0%
        L3 Bound:   0.1%
        DRAM Bound: 0.8%
            Memory Bandwidth:   36.1%
            Memory Latency: 60.4%
    Loads:  12,912,960,000
    Stores: 7,720,800,000
    LLC Miss Count: 420,000
    Average Latency (cycles):   15
    Total Thread Count: 4
    Paused Time:    18.081s

Elapsed Time:   33.011s
    CPU Time:   4.501s
    Memory Bound:   36.9%
        L1 Bound:   10.6%
        L2 Bound:   0.0%
        L3 Bound:   0.2%
        DRAM Bound: 0.6%
            Memory Bandwidth:   36.5%
            Memory Latency: 62.7%
    Loads:  9,836,100,000
    Stores: 5,876,400,000
    LLC Miss Count: 180,000
    Average Latency (cycles):   15
    Total Thread Count: 4
    Paused Time:    17.913s

Elapsed Time:   33.738s
    CPU Time:   5.999s
    Memory Bound:   38.5%
        L1 Bound:   10.8%
        L2 Bound:   0.0%
        L3 Bound:   0.1%
        DRAM Bound: 0.9%
            Memory Bandwidth:   57.8%
            Memory Latency: 37.3%
    Loads:  13,592,760,000
    Stores: 8,125,200,000
    LLC Miss Count: 660,000
    Average Latency (cycles):   15
    Total Thread Count: 4
    Paused Time:    18.228s

据我所知,摘要页面的情况并不是很好。

《Finding your Memory Access performance bottlenecks》一文说,原因是所谓的伪共享(false sharing)。但我没有使用多线程,所有处理都只由一个线程执行。

另一方面,根据内存访问分析/平台页面,DRAM带宽不是瓶颈。

所以问题是

  1. 为什么CPU时间度量值对于并发分析和内存访问分析而言是不同的
  2. 内存指标值不好的原因是什么,特别是对于L1绑定?
  3. 主循环是lambda函数,其中

    • tasklets:std::vector,元素是包含数据处理系数的简单结构
    • points:数据本身,类型为 Eigen::Matrix
    • projections:Eigen::Matrix,用于存放处理结果的数组

    代码是:

    #include <future>
    #include <iostream>
    #include <random>
    #include <sstream>
    #include <vector>
    
    #include <Eigen/Dense>
    
    #include <ittnotify.h>
    
    using namespace std;
    
    // 3-component float vector, and a 3 x N float matrix (one column per point).
    using Vector3 = Eigen::Matrix<float, 3, 1>;
    using Matrix3X = Eigen::Matrix<float, 3, Eigen::Dynamic>;
    
    // Global RNG used to generate per-tasklet coefficients in [0.1, 100).
    // NOTE(review): the engine is default-seeded, so every program run
    // produces the same coefficient sequence (deterministic benchmark input).
    uniform_real_distribution<float> rnd(0.1f, 100.f);
    default_random_engine gen;
    
    // A single unit of work: computes a weighted unit direction between two
    // columns of the input matrix and writes it to one column of the output.
    class Tasklet {
    public:
        // p1/p2 are column indices into the point matrix; the result is
        // written to column p1 (Loc0) of the output. The distance and weight
        // coefficients are drawn from the global RNG at construction time.
        Tasklet(int p1, int p2)
            :
            p1Id(p1), p2Id(p2), Loc0(p1)
        {
            RestDistance = rnd(gen);
            Weight_2 = rnd(gen);
        }
        // Reads columns p1Id and p2Id of q, writes column Loc0 of p.
        // NOTE(review): if the two points coincide, dist is 0 and the
        // division yields NaN/Inf — same behavior as the original code.
        __forceinline void solve(const Matrix3X& q, Matrix3X& p)
        {
            Vector3 q1 = q.col(p1Id);
            Vector3 q2 = q.col(p2Id);
            // Removed dead code: a "for (int i = 0; i < 0; ++i)" loop whose
            // body (blueNorm/hypotNorm) could never execute.
            Vector3 deltaQ = q2 - q1;
            float dist = deltaQ.norm();
            Vector3 deltaUnitVector = deltaQ / dist;
            p.col(Loc0) = deltaUnitVector * RestDistance * Weight_2;
        }
    
        int p1Id;           // first point: source column index
        int p2Id;           // second point: column index
        int Loc0;           // destination column index (== p1Id)
        float RestDistance; // random coefficient in [0.1, 100)
        float Weight_2;     // random coefficient in [0.1, 100)
    };
    
    typedef vector<Tasklet*> TaskList;
    
    void
    runTest(const Matrix3X& points, Matrix3X& projections, TaskList& tasklets)
    {
        size_t num = tasklets.size();
        for (size_t i = 0; i < num; ++i) {
            Tasklet* t = tasklets[i];
            t->solve(points, projections);
        }
    }
    
    // Allocate the input/output matrices (3 x numPoints), fill the input with
    // random values, and create one tasklet per consecutive pair of points.
    void
    prepareData(Matrix3X& points, Matrix3X& projections, int numPoints, TaskList& tasklets)
    {
        points.resize(3, numPoints);
        projections.resize(3, numPoints);
        points.setRandom();
        if (numPoints < 2) {
            // No consecutive pairs to process. This guard also prevents
            // reserve(numPoints - 1) from wrapping to SIZE_MAX (and throwing
            // std::length_error) when numPoints == 0.
            return;
        }
        tasklets.reserve(numPoints - 1);
        for (int i = 1; i < numPoints; ++i) {
            tasklets.push_back(new Tasklet(i - 1, i));
        }
    }
    
    int
    main(int argc, const char** argv)
    {
        // Pause VTune data collection
        __itt_pause();
        cout << "Usage: <exefile> <number of points (in thousands)> <#runs for averaging>" << endl;
    
        int numPoints = 150 * 1000;
        int numRuns = 1;
        int argNo = 1;
    
        if (argc > argNo) {
            istringstream in(argv[argNo]);
            int i;
            in >> i;
            if (in) {
                numPoints = i * 1000;
            }
        }
        ++argNo;
        if (argc > argNo) {
            istringstream in(argv[argNo]);
            int i;
            in >> i;
            if (in) {
                numRuns = i;
            }
        }
        cout
            << "Running test" << endl
            << "\t NumPoints (thousands): " << numPoints / 1000. << endl
            << "\t # of runs for averaging: " << numRuns << endl;
    
        Matrix3X q, projections;
        TaskList tasklets;
    
        cout << "Preparing test data" << endl;
    
        prepareData(q, projections, numPoints, tasklets);
    
        cout << "Running test" << endl;
    
        // Resume VTune data collection
        __itt_resume();
        for (int r = 0; r < numRuns; ++r) {
            runTest(q, projections, tasklets);
        }
        // Pause VTune data collection
        __itt_pause();
    
        for (auto* t : tasklets) {
            delete t;
        }
    
        return 0;
    }
    

    谢谢。

0 个答案:

没有答案