Question

我对SSE的使用有一个非常奇怪的问题。

我编写了以下函数，其中我使用SSE来计算两个浮点数组的差异的最大值，每个浮点数组包含64个浮点数。

dists-array是通过_aligned_malloc分配的二维数组。

#include <iostream>
#include <xmmintrin.h>
#include <time.h>
#include <stdio.h>
#include <algorithm>
#include <fstream>

#include "hr_time.h"

using namespace std;

// Row tables filled in by Deserialize(): dists[k] and dists2[k] each point to
// one row of floats. test() reads every row through __m128 pointers, so each
// row must start on a 16-byte boundary.
float** dists;
float** dists2;
// Cursors that test() walks across the four rows it compares.
__m128* a;
__m128* b;
__m128* c;
__m128* d;
// Scratch vectors for the per-step differences and the running lane-wise maximum.
__m128 diff;
__m128 diff2;
__m128 mymax;
// Destination for the final _mm_store_ps; the caller must point this at
// 16-byte aligned storage for four floats before calling test().
float* myfmax;

// Returns the largest lane value seen over
//   dists[s][k]  - dists[t][k]    and
//   dists2[t][k] - dists2[s][k]   for k in [0, 64).
// The running maximum starts at zero, so the result is never negative.
// NOTE(review): the two subtractions run in opposite directions (s - t versus
// t - s); that is preserved from the original code -- confirm it is intended.
//
// s, t: row indices into dists / dists2 (not range-checked here).
// Side effects: overwrites the globals a, b, c, d, diff, diff2, mymax and the
// four floats at myfmax.
float test(int s, int t)
{
    // Each row holds 64 floats, i.e. 16 four-float SSE vectors. The previous
    // bound (i <= 16) processed 17 vectors and read past the end of every row.
    const int floatsPerRow = 64;
    const int vectorsPerRow = floatsPerRow / 4;

    a = (__m128*) dists[s];
    b = (__m128*) dists[t];
    c = (__m128*) dists2[s];
    d = (__m128*) dists2[t];

    mymax = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
    for (int i = 0; i < vectorsPerRow; i++)
    {
        diff = _mm_sub_ps(*a, *b);
        mymax = _mm_max_ps(diff, mymax);

        diff2 = _mm_sub_ps(*d, *c);
        mymax = _mm_max_ps(diff2, mymax);

        a++;
        b++;
        c++;
        d++;
    }

    // Spill the four lane maxima and reduce them to a single scalar.
    _mm_store_ps(myfmax, mymax);
    float res = std::max(std::max(std::max(myfmax[0], myfmax[1]), myfmax[2]), myfmax[3]);
    return res;
}

// Loads the two row tables (dists, dists2) from a binary stream.
//
// Layout read from the stream, in this order:
//   int numOfElements           floats per row
//   int arraySize               number of rows per table
//   arraySize rows of numOfElements floats for dists
//   arraySize rows of numOfElements floats for dists2
//
// Every row is allocated separately with 16-byte alignment.
//
// Returns 0 on success and -1 if the stream is null, a read fails, a size in
// the header is not positive, or an allocation fails. On failure nothing is
// released, and dists / dists2 may be partially populated; callers should not
// use them.
int Deserialize(std::istream* stream)
{
    if (!stream)
    {
        return -1;
    }

    int numOfElements = 0;
    int arraySize = 0;

    stream->read((char*)&numOfElements, sizeof(int)); // numOfElements = 64
    stream->read((char*)&arraySize, sizeof(int)); // arraySize = 8000000
    // A failed read leaves the sizes meaningless; do not allocate from them.
    if (!*stream || numOfElements <= 0 || arraySize <= 0)
    {
        return -1;
    }

    const size_t rowBytes = numOfElements * sizeof(float);

    dists = (float**)_aligned_malloc(arraySize * sizeof(float*), 16);
    dists2 = (float**)_aligned_malloc(arraySize * sizeof(float*), 16);
    if (!dists || !dists2)
    {
        return -1;
    }

    for (int j = 0; j < arraySize; j++)
    {
        dists[j] = (float*)_aligned_malloc(rowBytes, 16);
        dists2[j] = (float*)_aligned_malloc(rowBytes, 16);
        if (!dists[j] || !dists2[j])
        {
            return -1;
        }
    }

    // The file stores all rows of dists first, then all rows of dists2.
    for (int i = 0; i < arraySize; i++)
    {
        stream->read((char*)dists[i], rowBytes);
    }

    for (int i = 0; i < arraySize; i++)
    {
        stream->read((char*)dists2[i], rowBytes);
    }

    // A short or unreadable file sets the stream's fail state on some read above.
    if (!*stream)
    {
        return -1;
    }

    return 0;
}

// Benchmark driver: loads the tables from "binary_file", then times `entries`
// calls to test() on randomly chosen row pairs and prints the elapsed time.
// Returns 1 if setup fails (allocation, file open, or deserialization).
int main(int argc, char** argv)
{
    // NOTE(review): this must not exceed the row count stored in the file,
    // because s and t below index dists / dists2 directly -- confirm.
    int entries = 8000000;

    // test() stores its four-lane result here; the storage is allocated with
    // 16-byte alignment for the _mm_store_ps in test().
    myfmax = (float*)_aligned_malloc(4 * sizeof(float), 16);
    if (!myfmax)
    {
        std::cerr << "failed to allocate result buffer" << std::endl;
        return 1;
    }

    std::ifstream fs("binary_file", std::ios::binary);
    if (!fs)
    {
        std::cerr << "failed to open binary_file" << std::endl;
        return 1;
    }
    if (Deserialize(&fs) != 0)
    {
        std::cerr << "failed to load binary_file" << std::endl;
        return 1;
    }

    // Stack object instead of `new`: the stopwatch was previously never deleted.
    CStopWatch watch;
    watch.StartTimer();
    int i;
    for (i = 0; i < entries; i++)
    {
        int s = rand() % entries;
        int t = rand() % entries;
        test(s, t);
    }
    watch.StopTimer();
    std::cout << i << " iterations took " << watch.GetElapsedTimeMs() << "ms" << std::endl;

    // Keep the console window open until a key is pressed.
    std::cin.get();
}

我的问题是：如果我在 Visual Studio 中附加调试器运行，这段代码运行得非常快；但只要不带调试器直接执行，它就会变得很慢。于是我做了一点研究，发现这两种启动方式之间的一个区别是“调试堆”（Debug Heap）。因此我通过定义“_NO_DEBUG_HEAP=1”将其禁用。启用该选项后，即使附加了调试器，性能也同样非常差。

但我不明白，为什么使用调试堆（Debug Heap）反而能获得更好的性能？我不知道该如何解决这个问题，所以希望你们当中有人可以帮助我。

提前致谢。

此致卡斯滕

Answer 1

您的代码有错误。 _mm_store_ps存储一个包含四个浮点数的数组，但您只声明一个浮点数。编译器甚至不允许你这样做。

更改

// Broken version: fmax is declared as a single float, yet it is handed to
// _mm_store_ps (which writes four floats) and indexed as a four-element array.
float fmax;
_mm_store_ps(fmax, max);
pi = std::max(std::max(std::max(fmax[0], fmax[1]), fmax[2]), fmax[3]);

到

// Four floats with 16-byte alignment, so _mm_store_ps has room for the whole vector.
float __declspec(align(16)) fmax[4];
_mm_store_ps(fmax, max);
// Reduce the four stored lanes to their maximum.
return std::max(std::max(std::max(fmax[0], fmax[1]), fmax[2]), fmax[3]);

在没有调试器的情况下启动时，Visual C ++ SSE功能会变慢

1 个答案: