将 float 数组快速转换为 double 数组并转换回来

时间:2016-11-16 17:44:46

标签: c++ visual-c++ sse

我需要将内存中的大型 float 数组转换为 double 数组,并且还要能再转换回来。在 Visual C++ 15 update 3 中是否有任何 SSE 编译器内在函数(intrinsic)可以提供帮助?

编辑:这是两种传输格式(wire format)之间的转换,因此 #define 无济于事。数据结构以 float 形式存储,但第三方处理库需要一个 double 数组。

3 个答案:

答案 0 :(得分:5)

您可以使用SSE:

float -> double:_mm_cvtps_pd

double -> float:_mm_cvtpd_ps

首先尝试一个简单的标量循环,因为(a)编译器可能会为你进行矢量化,并且(b)你很可能受内存限制,因此SIMD优化可能无济于事。

答案 1 :(得分:2)

这不是您问题的实际答案,而只是一个示例,说明如何只让ALU在转换时工作。如果正确实现,可以将其与FPU转换并行以获得更快的速度。该解决方案应该是100%IEEE兼容的。

更新:我把这段代码改得更慢但更易读,并且与 IEEE 兼容,其行为与 Intel 在第 3 代 i7 中的实现一致(甚至 NaN 的转换结果也按位相同)。

#include <math.h>

#include <chrono>
#include <cstring>
#include <iostream>

// Converts one IEEE-754 binary32 bit pattern into the binary64 bit pattern of
// the same value, using integer operations only (no FPU conversion).
//
// Zero, normal numbers, subnormals and infinities are converted exactly. For
// NaN inputs the 23 fraction bits are copied into the top of the 52-bit
// fraction unchanged (the quiet bit is not forced).
static unsigned long long floatBitsToDoubleBits(unsigned int bits)
{
    const unsigned long long kDoubleExpMask  = 0x7FF0000000000000ull;
    const unsigned long long kDoubleFracMask = 0x000FFFFFFFFFFFFFull;
    const unsigned long long kDoubleImplicit = 0x0010000000000000ull; // bit 52
    const unsigned long long kOneExponent    = 0x0010000000000000ull; // exponent field += 1
    const unsigned long long kBiasDelta      = 0x3800000000000000ull; // (1023 - 127) << 52

    // Move the three float fields up so the sign sits at bit 63, the exponent at
    // bits 55..62 and the fraction is left-aligned in the double fraction (bits 29..51).
    const unsigned long long sign = static_cast<unsigned long long>(bits & 0x80000000u) << 32;
    const unsigned long long fExp = static_cast<unsigned long long>(bits & 0x7F800000u) << 32;
    unsigned long long frac = static_cast<unsigned long long>(bits & 0x007FFFFFu) << 29;

    unsigned long long dExp = 0ull;
    if (fExp == 0x7F80000000000000ull)
    {
        // Infinity or NaN: exponent field becomes all ones.
        dExp = kDoubleExpMask;
    }
    else if (fExp != 0ull)
    {
        // Normal number: shift the exponent into the double position and re-bias.
        dExp = (fExp >> 3) + kBiasDelta;
    }
    else if (frac != 0ull)
    {
        // Float subnormal: every float subnormal is a normal double, so shift the
        // fraction left until its leading 1 reaches the implicit-bit position and
        // lower the exponent by one for every shift after the first.
        unsigned long long shifts = 0ull;
        while (!(frac & kDoubleImplicit))
        {
            frac <<= 1;
            ++shifts;
        }
        // The leading bit starts at bit 51 or lower, so shifts >= 1 here.
        dExp = kBiasDelta - (shifts - 1ull) * kOneExponent;
    }
    // else: +/-0 keeps a zero exponent and a zero fraction.

    return sign | dExp | (frac & kDoubleFracMask);
}

// Widens `count` floats from `inData` into `outData` using only integer bit
// manipulation.
//
// inData  - source array holding at least `count` floats.
// outData - destination array with room for at least `count` doubles.
// count   - number of elements to convert; any value is accepted (odd counts
//           included), and a count <= 0 converts nothing.
//
// Each element is read and written through std::memcpy, so the function does
// not depend on the arrays being 8-byte aligned and does not access the arrays
// through an incompatible pointer type.
void toDouble(float *inData, double *outData, int count)
{
    static_assert(sizeof(float) == sizeof(unsigned int), "float must be 32 bits wide");
    static_assert(sizeof(double) == sizeof(unsigned long long), "double must be 64 bits wide");

    for (int i = 0; i < count; ++i)
    {
        unsigned int floatBits;
        std::memcpy(&floatBits, inData + i, sizeof floatBits);

        const unsigned long long doubleBits = floatBitsToDoubleBits(floatBits);
        std::memcpy(outData + i, &doubleBits, sizeof doubleBits);
    }
}

// Converts one IEEE-754 binary64 bit pattern into the binary32 bit pattern of
// the nearest float (round-to-nearest, ties-to-even), using integer operations
// only.
//
// Magnitudes too large for a float become infinity, magnitudes too small become
// a float subnormal or zero, and every NaN becomes a quiet NaN with only the
// top fraction bit set (the payload is not preserved).
static unsigned int doubleBitsToFloatBits(unsigned long long bits)
{
    const unsigned long long kSignMask    = 0x8000000000000000ull;
    const unsigned long long kExpMask     = 0x7FF0000000000000ull;
    const unsigned long long kFracMask    = 0x000FFFFFFFFFFFFFull;
    const unsigned long long kImplicitBit = 0x0010000000000000ull; // bit 52
    const unsigned long long kBiasDelta   = 0x3800000000000000ull; // (1023 - 127) << 52
    const unsigned long long kRoundBit    = 0x0000000010000000ull; // first discarded bit
    const unsigned long long kFloatLsb    = 0x0000000020000000ull; // lowest kept bit
    const unsigned long long kStickyMask  = 0x000000000FFFFFFFull; // remaining discarded bits

    const unsigned long long sign  = bits & kSignMask;
    const unsigned long long dExp  = bits & kExpMask;
    const unsigned long long dFrac = bits & kFracMask;

    // fExp holds the float exponent in the double exponent position (bits 52..59);
    // frac holds the fraction still at double precision (float bits are 29..51).
    unsigned long long fExp = 0ull;
    unsigned long long frac = 0ull;

    if (dExp <= kBiasDelta)
    {
        // Result is a float subnormal or zero.
        if (dExp >= 0x3600000000000000ull)
        {
            // Make the implicit 1 explicit and shift it down to the subnormal scale.
            const unsigned int shift = static_cast<unsigned int>((kBiasDelta - dExp) >> 52) + 1u;
            const unsigned long long significand = dFrac | kImplicitBit;
            frac = significand >> shift;
            // Bits shifted out must still take part in rounding, otherwise a value
            // just above a tie would be rounded as if it were an exact tie.
            if (significand & ((1ull << shift) - 1ull))
                frac |= 1ull;
        }
        // else: far below the smallest subnormal, rounds to zero.
    }
    else if (dExp >= 0x47F0000000000000ull)
    {
        // Overflow, infinity or NaN: exponent field becomes all ones.
        fExp = 0x0FF0000000000000ull;
        if (dExp == kExpMask && dFrac != 0ull)
            frac = 0x0008000000000000ull; // quiet NaN
    }
    else
    {
        // Normal float: re-bias the exponent, keep the fraction.
        fExp = dExp - kBiasDelta;
        frac = dFrac;
    }

    // Round to nearest, ties to even, at the float's least significant bit.
    if ((frac & kRoundBit) && ((frac & kFloatLsb) || (frac & kStickyMask)))
        frac += kFloatLsb;

    // A carry out of the fraction lands in bit 52 and bumps the exponent by one
    // (subnormal -> smallest normal, or largest normal -> infinity).
    fExp += frac & kExpMask;
    frac &= 0x000FFFFFE0000000ull;

    // Shift exponent and fraction into float position within the upper 32 bits,
    // then take that upper half as the float bit pattern.
    return static_cast<unsigned int>((sign | ((fExp | frac) << 3)) >> 32);
}

// Narrows `count` doubles from `inData` into `outData` using only integer bit
// manipulation, rounding to nearest with ties to even.
//
// inData  - source array holding at least `count` doubles.
// outData - destination array with room for at least `count` floats.
// count   - number of elements to convert; any value is accepted (odd counts
//           included), and a count <= 0 converts nothing.
//
// Each element is read and written through std::memcpy, so the function does
// not depend on the float array being 8-byte aligned and does not access the
// arrays through an incompatible pointer type.
void toFloat(double *inData, float *outData, int count)
{
    static_assert(sizeof(float) == sizeof(unsigned int), "float must be 32 bits wide");
    static_assert(sizeof(double) == sizeof(unsigned long long), "double must be 64 bits wide");

    for (int i = 0; i < count; ++i)
    {
        unsigned long long doubleBits;
        std::memcpy(&doubleBits, inData + i, sizeof doubleBits);

        const unsigned int floatBits = doubleBitsToFloatBits(doubleBits);
        std::memcpy(outData + i, &floatBits, sizeof floatBits);
    }
}

// Checks a float -> double conversion result against the built-in cast.
//
// Returns the index of the first element where outData differs from
// (double)inData, or -1 when every element matches. A position where both the
// input and the output are NaN is not counted as a mismatch.
int valTestFtoD(float *inData, double *outData, int count)
{
    int idx = 0;
    while (idx < count)
    {
        const double expected = static_cast<double>(inData[idx]);
        const double actual = outData[idx];

        // NaN never compares equal to itself, so x != x detects NaN.
        const bool bothNaN = (inData[idx] != inData[idx]) && (actual != actual);

        if (expected != actual && !bothNaN)
            return idx;

        ++idx;
    }
    return -1;
}

// Checks a double -> float conversion result against the built-in cast.
//
// Returns the index of the first element where outData differs from
// (float)inData, or -1 when every element matches. A position where both the
// input and the output are NaN is not counted as a mismatch.
int valTestDtoF(double *inData, float*outData, int count)
{
    int idx = 0;
    while (idx < count)
    {
        const float expected = static_cast<float>(inData[idx]);
        const float actual = outData[idx];

        // NaN never compares equal to itself, so x != x detects NaN.
        const bool bothNaN = (inData[idx] != inData[idx]) && (actual != actual);

        if (expected != actual && !bothNaN)
            return idx;

        ++idx;
    }
    return -1;
}

// Benchmark and validation driver for toDouble().
//
// Allocates 8 Mi floats and doubles, times a plain cast loop and then
// toDouble() over generated data, and checks the toDouble() output with
// valTestFtoD(). All timings printed are whole milliseconds.
void testFloatToDouble()
{
    using Clock = std::chrono::steady_clock;

    // Whole milliseconds elapsed since `since`.
    const auto elapsedMs = [](Clock::time_point since) -> long long {
        return std::chrono::duration_cast<std::chrono::milliseconds>(Clock::now() - since).count();
    };

    std::cout << "\nSTART Float to Double TEST\n";

    const int total = 1024 * 1024 * 8;
    float *floats = new float[total];
    double *doubles = new double[total];

    // Phase 1: generate float data with a recurrence and clear the doubles.
    Clock::time_point mark = Clock::now();
    floats[0] = 2.0f;
    for (int k = 1; k < total; ++k)
    {
        floats[k] = k / floats[k - 1];
        doubles[k] = 0.0f;
    }
    long long ms = elapsedMs(mark);
    std::cout << "init of floats and doubles done in " << ms << std::endl;

    // Phase 2: reference timing using the built-in conversion.
    mark = Clock::now();
    for (int k = 0; k < total; ++k)
    {
        doubles[k] = floats[k];
    }
    ms = elapsedMs(mark);
    std::cout << "cast to double done in " << ms << std::endl;

    // Phase 3: regenerate the inputs with a different recurrence and reset outputs.
    mark = Clock::now();
    const float pi = 3.14159265358979323846;
    const float e = 2.71828182845904523536;
    floats[0] = pi;
    doubles[0] = 0.0;
    for (int k = 1; k < total; ++k)
    {
        floats[k] = (e + k) / floats[k - 1];
        doubles[k] = 0.0;
    }
    ms = elapsedMs(mark);
    std::cout << "init of floats and doubles done in " << ms << std::endl;

    // Phase 4: time the integer-only conversion.
    mark = Clock::now();
    toDouble(floats, doubles, total);
    ms = elapsedMs(mark);
    std::cout << "toDouble done in " << ms << std::endl;

    // Phase 5: compare against the built-in cast and report the first mismatch.
    std::cout << "toDouble validation test ";
    const int badIndex = valTestFtoD(floats, doubles, total);
    if (badIndex >= 0)
    {
        std::cout << "FAIL at " << badIndex << std::endl;
        std::cout << "float [" << badIndex << "]= " << floats[badIndex] << std::endl;
        std::cout << "double[" << badIndex << "]= " << doubles[badIndex] << std::endl;
    }
    else
    {
        std::cout << "OK" << std::endl;
    }

    delete[] floats;
    delete[] doubles;

    std::cout << "END TEST\n";
}

// Benchmark and validation driver for toFloat().
//
// Allocates 8 Mi floats and doubles, times a plain cast loop and then
// toFloat() over generated data, and checks the toFloat() output with
// valTestDtoF(). All timings printed are whole milliseconds.
void testDoubleToFloat()
{
    using Clock = std::chrono::steady_clock;

    // Whole milliseconds elapsed since `since`.
    const auto elapsedMs = [](Clock::time_point since) -> long long {
        return std::chrono::duration_cast<std::chrono::milliseconds>(Clock::now() - since).count();
    };

    std::cout << "\nSTART Double to Float TEST\n";

    const int total = 1024 * 1024 * 8;
    float *floats = new float[total];
    double *doubles = new double[total];

    // Phase 1: generate double data with a recurrence and clear the floats.
    Clock::time_point mark = Clock::now();
    doubles[0] = 2.0f;
    for (int k = 1; k < total; ++k)
    {
        doubles[k] = k / doubles[k - 1];
        floats[k] = 0.0f;
    }
    long long ms = elapsedMs(mark);
    std::cout << "init of floats and doubles done in " << ms << std::endl;

    // Phase 2: reference timing using the built-in conversion.
    mark = Clock::now();
    for (int k = 0; k < total; ++k)
    {
        floats[k] = (float)doubles[k];
    }
    ms = elapsedMs(mark);
    std::cout << "cast to float done in " << ms << std::endl;

    // Phase 3: regenerate the inputs with a different recurrence and reset outputs.
    mark = Clock::now();
    const double pi = 3.14159265358979323846;
    const double e = 2.71828182845904523536;
    doubles[0] = pi;
    floats[0] = 0.0f;
    for (int k = 1; k < total; ++k)
    {
        doubles[k] = (e + k) / doubles[k - 1];
        floats[k] = 0.0f;
    }
    ms = elapsedMs(mark);
    std::cout << "init of floats and doubles done in " << ms << std::endl;

    // Phase 4: time the integer-only conversion.
    mark = Clock::now();
    toFloat(doubles, floats, total);
    ms = elapsedMs(mark);
    std::cout << "toFloat done in " << ms << std::endl;

    // Phase 5: compare against the built-in cast and report the first mismatch.
    std::cout << "toFloat validation test ";
    const int badIndex = valTestDtoF(doubles, floats, total);
    if (badIndex >= 0)
    {
        std::cout << "FAIL at " << badIndex << std::endl;
        std::cout << "double[" << badIndex << "]= " << doubles[badIndex] << std::endl;
        std::cout << "float[" << badIndex << "]= " << floats[badIndex] << std::endl;
    }
    else
    {
        std::cout << "OK" << std::endl;
    }

    delete[] floats;
    delete[] doubles;

    std::cout << "END TEST\n";
}

// Program entry point: runs the float -> double benchmark first, then the
// double -> float benchmark.
int main()
{
    testFloatToDouble();
    testDoubleToFloat();
    return 0;
}

online example

答案 2 :(得分:1)

  

数据结构存储为浮点数,但第三方处理库需要一个double数组。

它可以在缓存大小的块中处理吗?

如果不是受限于第三方库,最好的方法是即时转换:用 _mm_cvtps_pd 从一对 float 加载出一对 double,并以类似的方式存回 float,这样内存中就永远不需要存在一个 double 数组。

但是如果做不到这一点,至少可以在读取一部分 float 并写出一部分 double 之后,趁这些数据在 L1 或 L2 缓存中仍然是热的,就把它们交给库处理。

实际上,如果它是“传输格式(wire format)”,那么数据在进入内存的途中大概本来就必须经过 CPU,除非你有一个零拷贝的接收 API,可以通过 DMA 直接写入你的缓冲区。理想的转换时机可能是在收到每个数据包时,以小块为单位进行转换:要么在复制的同时直接转换为 double;如果还需要原始的 float 数据,则同时复制到 float 和 double 两个数组中。