今天我学习了 C++ AMP 的基础知识,目的是将它用于我正在编写的插件。该插件需要进行大量的边与三角形的相交测试,所以我编写了这段测试代码,看看这种做法是否可行。
处理函数和数据量都与实际情况相符,数据本身只是随机生成的。
令我失望的是,AMP版本的运行速度比未经优化的CPU版本快〜3倍(我试图让两个版本的函数尽可能相似)。问题似乎出在写入结果视图的那部分(不过我怀疑,把那部分注释掉之后,整个着色器都会被编译器优化掉)。
无论如何,我还有一个优化过的CPU版本,运行速度比测试代码中的CPU版本快20倍,但仍然太慢。AMP版本有没有希望赶上并超越它?
代码:
#include <math.h>
#include <sys/timeb.h>

#include <chrono>
#include <cstdio>
#include <iostream>
#include <random>
#include <vector>

#include <amp.h>
#include <amp_math.h>
using namespace concurrency;
// A 3-component float tuple. Default construction fills each component with
// a pseudo-random whole number in [0, 99] drawn from rand().
struct point3 {
    float a, b, c;

    // Members are initialized in declaration order (a, b, c), so rand() is
    // consumed in that same order.
    point3()
        : a(static_cast<float>(rand() % 100)),
          b(static_cast<float>(rand() % 100)),
          c(static_cast<float>(rand() % 100)) {}
};
// A bounding sphere: a center point plus a radius.
struct thing {
    point3 center;        // filled in by point3's default constructor
    float radius = 0.0f;  // explicit default so `thing t;` never reads an indeterminate value
};
// Milliseconds between the timer origin (fixed on the first tick() call) and
// the most recent tick() call.
int lastTick;

// Returns the number of milliseconds elapsed since the previous call to
// tick() and starts a new interval. The first call returns 0 and only starts
// the timer.
//
// Time is read from std::chrono::steady_clock, which is monotonic, so the
// result is never negative and is unaffected by wall-clock adjustments.
int tick() {
    using clock = std::chrono::steady_clock;
    static const clock::time_point origin = clock::now();

    const int previous = lastTick;
    const auto sinceOrigin =
        std::chrono::duration_cast<std::chrono::milliseconds>(clock::now() - origin);
    lastTick = static_cast<int>(sinceOrigin.count());
    return lastTick - previous;
}
int main()
{
system("PAUSE");
std::vector<thing> faces(13500);
std::vector<thing> edges(20000);
std::vector<int> results(faces.size() * edges.size());
array_view<const thing> v1(faces.size(), faces);
array_view<const thing> v2(edges.size(), edges);
array_view<int> vr(results.size(), results);
int nFaces = faces.size();
int nEdges = edges.size();
tick();
for (int x = 0; x < nFaces; x++) {
thing* face = &faces[x];
int a, b, c;
thing* edge;
for (int y = 0; y < nEdges; y++) {
edge = &edges[y];
a = edge->center.a - face->center.a;
b = edge->center.b - face->center.b;
c = edge->center.c - face->center.c;
results[x * nFaces + y] = (sqrt(a * a + b * b + c * c) < (face->radius + edge->radius)) ? 1 : 0;
}
}
std::cout << "cpu time: " << tick() << "\n";
v1.refresh();
v2.refresh();
parallel_for_each(v1.extent, [=](index<1> x) restrict(amp) {
const thing* face = &v1[x];
int a, b, c;
const thing* edge;
for (int y = 0; y < nEdges; y++) {
edge = &v2[y];
a = edge->center.a - face->center.a;
b = edge->center.b - face->center.b;
c = edge->center.c - face->center.c;
vr[x * nFaces + y] = (fast_math::sqrt(a * a + b * b + c * c) < (face->radius + edge->radius)) ? 1 : 0;
}
});
vr.synchronize();
std::cout << "gpu time: " << tick() << "\n";
system("PAUSE");
}