Question

我目前正在优化 OpenCL 代码以提高性能，我确定以下方法是性能瓶颈，我想用等效的 OpenCL-SIMD-Replacement 替换它：

typedef unsigned long uint64_t;

void multiply256(const unsigned int x[8], const unsigned int y[8], unsigned int out_high[8], unsigned int out_low[8])
{
    unsigned int z[16];
    unsigned int high = 0;
    uint64_t product = 0;

    // First round, overwrite z
    for(int j = 7; j >= 0; j--) {

        product = (uint64_t)x[7] * y[j] + high;

        z[7 + j + 1] = (unsigned int)product;
        high = (unsigned int)(product >> 32);
    }
    z[7] = high;

    for(int i = 6; i >= 0; i--) {

        high = 0;

        for(int j = 7; j >= 0; j--) {

            product = (uint64_t)x[i] * y[j] + z[i + j + 1] + high;

            z[i + j + 1] = (unsigned int)product;

            high = product >> 32;
        }

        z[i] = high;
    }

    for(int i = 0; i < 8; i++) {
        out_high[i] = z[i];
        out_low[i] = z[8 + i];
    }
}

所以我想我可以像这样替换它：

void multiply256(const unsigned int x[8], const unsigned int y[8], unsigned int out_high[8], unsigned int out_low[8])
{
    uint8 x8;
    x8[0] = x[0];
    x8[1] = x[1];
    x8[2] = x[2];
    x8[3] = x[3];
    x8[4] = x[4];
    x8[5] = x[5];
    x8[6] = x[6];
    x8[7] = x[7];
    uint8 y8;
    y8[0] = y[0];
    y8[1] = y[1];
    y8[2] = y[2];
    y8[3] = y[3];
    y8[4] = y[4];
    y8[5] = y[5];
    y8[6] = y[6];
    y8[7] = y[7];

    uint8 high = mul_hi(x8, y8);
    uint8 low = x8 * y8;

    out_high[0] = high[0];
    out_high[1] = high[1];
    out_high[2] = high[2];
    out_high[3] = high[3];
    out_high[4] = high[4];
    out_high[5] = high[5];
    out_high[6] = high[6];
    out_high[7] = high[7];

    out_low[0] = low[0];
    out_low[1] = low[1];
    out_low[2] = low[2];
    out_low[3] = low[3];
    out_low[4] = low[4];
    out_low[5] = low[5];
    out_low[6] = low[6];
    out_low[7] = low[7];
}

但它没有相同的结果。我做错了什么？

谢谢

将原生 C 矩阵乘法转换为 OpenCl SIMD 矩阵乘法

0 个答案: