在我的应用程序中,我想加速对一个大数组(约 1e8 个元素)的逐元素处理。
每个元素的处理过程非常简单,我怀疑瓶颈可能不是CPU而是DRAM带宽。 所以我决定先研究单线程版本。
系统配置:Windows 10 64位,32 GB RAM,Intel Core i7-3770S(Ivy Bridge)3.10 GHz,4核,启用超线程。
Elapsed Time: 34.425s
CPU Time: 14.908s
Effective Time: 14.908s
Idle: 0.005s
Poor: 14.902s
Ok: 0s
Ideal: 0s
Over: 0s
Spin Time: 0s
Overhead Time: 0s
Wait Time: 0.000s
Idle: 0.000s
Poor: 0s
Ok: 0s
Ideal: 0s
Over: 0s
Total Thread Count: 2
Paused Time: 18.767s
对相同的数据量连续运行三次,内存访问分析(Memory Access Analysis)给出了不同的 CPU 时间。按照上面并发分析(Concurrency Analysis)的结果,实际执行时间约为 15 秒。
Elapsed Time: 33.526s
CPU Time: 5.740s
Memory Bound: 38.3%
L1 Bound: 10.4%
L2 Bound: 0.0%
L3 Bound: 0.1%
DRAM Bound: 0.8%
Memory Bandwidth: 36.1%
Memory Latency: 60.4%
Loads: 12,912,960,000
Stores: 7,720,800,000
LLC Miss Count: 420,000
Average Latency (cycles): 15
Total Thread Count: 4
Paused Time: 18.081s
Elapsed Time: 33.011s
CPU Time: 4.501s
Memory Bound: 36.9%
L1 Bound: 10.6%
L2 Bound: 0.0%
L3 Bound: 0.2%
DRAM Bound: 0.6%
Memory Bandwidth: 36.5%
Memory Latency: 62.7%
Loads: 9,836,100,000
Stores: 5,876,400,000
LLC Miss Count: 180,000
Average Latency (cycles): 15
Total Thread Count: 4
Paused Time: 17.913s
Elapsed Time: 33.738s
CPU Time: 5.999s
Memory Bound: 38.5%
L1 Bound: 10.8%
L2 Bound: 0.0%
L3 Bound: 0.1%
DRAM Bound: 0.9%
Memory Bandwidth: 57.8%
Memory Latency: 37.3%
Loads: 13,592,760,000
Stores: 8,125,200,000
LLC Miss Count: 660,000
Average Latency (cycles): 15
Total Thread Count: 4
Paused Time: 18.228s
按照我对摘要(Summary)页面的理解,情况并不太好。
文章《Finding your Memory Access performance bottlenecks》指出,这种现象的原因是所谓的伪共享(false sharing)。但我并没有使用多线程,所有处理都只由一个线程执行。
另一方面,根据内存访问分析/平台页面,DRAM带宽不是瓶颈。
所以问题是
主循环是lambda函数,其中
代码是:
#include <future>
#include <iostream>
#include <limits>
#include <random>
#include <sstream>
#include <vector>

#include <Eigen/Dense>
#include <ittnotify.h>
// Brings every name from std into the global namespace for the rest of this file.
using namespace std;
// Fixed-size column vector of three floats.
using Vector3 = Eigen::Matrix<float, 3, 1>;
// Matrix of floats with three rows and a column count chosen at run time.
using Matrix3X = Eigen::Matrix<float, 3, Eigen::Dynamic>;
// Distribution used by Tasklet's constructor to draw RestDistance and Weight_2.
uniform_real_distribution<float> rnd(0.1f, 100.f);
// Engine feeding `rnd`. It is default-constructed (no explicit seed), so the
// generated sequence is the same on every run of a given build.
default_random_engine gen;
/// One unit of work of the benchmark.
///
/// A tasklet reads two columns of an input point matrix, computes the unit
/// vector pointing from the first to the second, scales it by RestDistance and
/// Weight_2, and stores the result in one column of an output matrix.
class Tasklet {
public:
    /// @param p1 Column index of the first point; also used as the output column.
    /// @param p2 Column index of the second point.
    ///
    /// RestDistance and Weight_2 are drawn from the file-level `rnd`
    /// distribution using the file-level `gen` engine. Members are initialised
    /// in declaration order, so RestDistance is drawn before Weight_2.
    Tasklet(int p1, int p2)
        : p1Id(p1), p2Id(p2), Loc0(p1), RestDistance(rnd(gen)), Weight_2(rnd(gen))
    {
    }

    /// Computes the scaled unit direction from point p1Id to point p2Id.
    ///
    /// @param q Input points, one point per column.
    /// @param p Output matrix; column Loc0 is overwritten.
    ///
    /// NOTE: the distance is not checked against zero. If both columns hold the
    /// same point the division yields non-finite values in the output column.
    __forceinline void solve(const Matrix3X& q, Matrix3X& p) const
    {
        // Evaluate into a fixed-size vector before touching `p`.
        const Vector3 deltaQ = q.col(p2Id) - q.col(p1Id);
        const float dist = deltaQ.norm();
        const Vector3 deltaUnitVector = deltaQ / dist;
        // Keep the original left-to-right scaling order so float rounding is unchanged.
        p.col(Loc0) = deltaUnitVector * RestDistance * Weight_2;
    }

    int p1Id;            // column index of the first input point
    int p2Id;            // column index of the second input point
    int Loc0;            // column index written in the output matrix
    float RestDistance;  // random scale factor drawn at construction
    float Weight_2;      // second random scale factor drawn at construction
};
typedef vector<Tasklet*> TaskList;
void
runTest(const Matrix3X& points, Matrix3X& projections, TaskList& tasklets)
{
size_t num = tasklets.size();
for (size_t i = 0; i < num; ++i) {
Tasklet* t = tasklets[i];
t->solve(points, projections);
}
}
/// Sizes the input/output matrices and appends one tasklet per pair of
/// consecutive points.
///
/// @param points       Resized to 3 x numPoints and filled with random values.
/// @param projections  Resized to 3 x numPoints; its contents are not set here.
/// @param numPoints    Number of points (columns).
/// @param tasklets     Receives numPoints - 1 tasklets linking column i - 1 to
///                     column i. Each one is allocated with `new`; the caller
///                     is responsible for deleting them.
void
prepareData(Matrix3X& points, Matrix3X& projections, int numPoints, TaskList& tasklets)
{
    points.resize(3, numPoints);
    projections.resize(3, numPoints);
    points.setRandom();
    /*
    Alternative fill that gives every point the same value:
    for (int i = 0; i < numPoints; ++i) {
    points.col(i) = Vector3(1, 0, 0);
    }
    */

    // Fewer than two points means there are no pairs to link. Returning here
    // also avoids passing `numPoints - 1` to reserve() when it is negative:
    // the implicit conversion to an unsigned size would request an enormous
    // capacity and make reserve() throw std::length_error.
    if (numPoints < 2) {
        return;
    }

    // Reserve on top of whatever the caller already stored in the list.
    tasklets.reserve(tasklets.size() + static_cast<TaskList::size_type>(numPoints - 1));
    for (int i = 1; i < numPoints; ++i) {
        tasklets.push_back(new Tasklet(i - 1, i));
    }
}
/// Parses argv[index] as an integer in [minValue, maxValue].
///
/// Returns true and stores the value in `out` on success. Returns false and
/// leaves `out` untouched when the argument is absent, is not an integer, or
/// is out of range; in the latter two cases a warning is written to stderr.
static bool
parseIntArg(int argc, const char** argv, int index, int minValue, int maxValue, int& out)
{
    if (index >= argc) {
        return false;  // not supplied: the caller keeps its default
    }
    std::istringstream in(argv[index]);
    int value = 0;
    if (!(in >> value) || value < minValue || value > maxValue) {
        std::cerr << "Ignoring argument '" << argv[index] << "': expected an integer in ["
                  << minValue << ", " << maxValue << "]" << std::endl;
        return false;
    }
    out = value;
    return true;
}

/// Benchmark entry point.
///
/// Optional arguments:
///   argv[1]  number of points, in thousands (default 150)
///   argv[2]  number of times runTest() is repeated (default 1)
///
/// VTune collection is paused during setup and teardown so that only the
/// runTest() loop is profiled.
int
main(int argc, const char** argv)
{
    // Pause VTune data collection
    __itt_pause();
    std::cout << "Usage: <exefile> <number of points (in thousands)> <#runs for averaging>" << std::endl;

    int numPoints = 150 * 1000;
    int numRuns = 1;

    // At least one thousand points, and few enough that `thousands * 1000`
    // cannot overflow int.
    int thousands = 0;
    if (parseIntArg(argc, argv, 1, 1, std::numeric_limits<int>::max() / 1000, thousands)) {
        numPoints = thousands * 1000;
    }
    // Zero runs is allowed: the timed loop is simply skipped.
    parseIntArg(argc, argv, 2, 0, std::numeric_limits<int>::max(), numRuns);

    std::cout
        << "Running test" << std::endl
        << "\t NumPoints (thousands): " << numPoints / 1000. << std::endl
        << "\t # of runs for averaging: " << numRuns << std::endl;

    Matrix3X q, projections;
    TaskList tasklets;
    std::cout << "Preparing test data" << std::endl;
    prepareData(q, projections, numPoints, tasklets);

    std::cout << "Running test" << std::endl;
    // Resume VTune data collection
    __itt_resume();
    for (int r = 0; r < numRuns; ++r) {
        runTest(q, projections, tasklets);
    }
    // Pause VTune data collection
    __itt_pause();

    // The list holds raw pointers allocated by prepareData(); release them here.
    for (auto* t : tasklets) {
        delete t;
    }
    return 0;
}
谢谢。