Question

为了对信号进行下采样，我使用了FIR滤波器+抽取级（实际上是跨步卷积）。将滤波与抽取相结合的最大优势是降低计算成本（计算量按抽取因子成比例减少）。

使用简单直接的OpenCL实现，我无法从抽取中受益。恰恰相反：抽取因子为4的卷积比完整卷积慢25％。

内核代码：

/*
 * Naive decimating FIR filter: every work-item produces one output sample,
 *
 *   output[n] = sum over tap of input[n * decimationFactor - tap] * coefs[tap]
 *
 * input            - input samples
 * output           - one result per work-item, indexed by get_global_id(0)
 * coefs            - filter coefficients; entries 0 .. taps-1 are read
 * taps             - number of filter coefficients
 * decimationFactor - distance in input samples between consecutive outputs
 *
 * Samples in front of input[0] are treated as zero, so the first outputs no
 * longer read from negative indices.
 *
 * NOTE(review): input is declared __constant; constant buffers are limited in
 * size by the device (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) - confirm that the
 * signal fits, or consider __global for it.
 */
__kernel void decimation(__constant float *input,
                         __global   float *output,
                         __constant float *coefs,
                         const int taps,
                         const int decimationFactor) {

    const int posOutput = get_global_id(0);

    // Index of the newest input sample that contributes to this output.
    const int posNewest = posOutput * decimationFactor;

    // Only taps 0 .. posNewest map to non-negative input indices; the
    // remaining taps would multiply implicit zeros and are skipped.
    const int usableTaps = (taps <= posNewest) ? taps : (posNewest + 1);

    float result = 0.0f;
    for (int tap = 0; tap < usableTaps; tap++) {
        result += input[posNewest - tap] * coefs[tap];
    }

    output[posOutput] = result;
}

我想这是由于内存访问未合并（uncoalesced）所致，但我想不出解决此问题的办法。有什么想法吗？

编辑：我尝试了Dithermaster的解决方案，将问题拆分为两步：先以合并读取的方式把数据载入共享的本地内存，再从本地内存进行卷积：

/*
 * Decimating FIR filter that stages the input through local memory.
 *
 * Each work-group first copies the input samples it needs into localInput,
 * synchronises, and then every work-item computes one output sample from the
 * local copy:
 *
 *   output[n] = sum over tap of input[n * decimationFactor - tap] * coefs[tap]
 *
 * input            - input samples; treated as holding
 *                    bufferSize * decimationFactor entries
 * output           - decimated result, bufferSize entries
 * coefs            - filter coefficients; entries 0 .. taps-1 are read
 * taps             - number of filter coefficients
 * decimationFactor - distance in input samples between consecutive outputs
 * bufferSize       - number of output samples
 * localInput       - scratch buffer in local memory; must hold at least
 *                    taps + (get_local_size(0) - 1) * decimationFactor floats
 *
 * Input positions outside [0, bufferSize * decimationFactor) are read as zero.
 */
__kernel void decimation(__constant float *input,
                        __global   float *output,
                        __constant float *coefs,
                        const int taps,
                        const int decimationFactor,
                        const int bufferSize,
                        __local float *localInput) {

    const int posOutput = get_global_id(0);
    const int localSize = get_local_size(0);
    const int localId   = get_local_id(0);
    const int groupId   = get_group_id(0);

    // Local index of the newest sample used by work-item 0; the taps-1
    // entries in front of it hold the history shared with the previous group.
    const int localInputOffset = taps - 1;

    // Exact number of samples the convolution below reads for this group:
    // the last work-item ends at localInputOffset + (localSize-1)*decimationFactor.
    const int localInputSize = localInputOffset + (localSize - 1) * decimationFactor + 1;

    // Global input index that corresponds to localInput[0].
    const int posInputStart = (groupId * localSize) * decimationFactor - localInputOffset;
    const int inputSize     = bufferSize * decimationFactor;

    // 1. transfer global input data to local memory
    // In each pass, neighbouring work-items read neighbouring input
    // addresses. Looping with a stride of localSize covers the whole tile for
    // any relation between taps, decimationFactor and localSize (including
    // taps - decimationFactor > localSize and taps < decimationFactor).
    for (int posLocalInput = localId; posLocalInput < localInputSize; posLocalInput += localSize) {
        const int posInput = posInputStart + posLocalInput;

        if ((posInput >= 0) && (posInput < inputSize)) {
            localInput[posLocalInput] = input[posInput];
        } else {
            localInput[posLocalInput] = 0.0f;  // zero-pad outside the signal
        }
    }

    // 2. wait until every work-item of the group has finished its copies
    barrier(CLK_LOCAL_MEM_FENCE);

    // 3. convolution (work-items past the end of the output only helped load)
    if (posOutput < bufferSize) {
        const int posLocalNewest = localInputOffset + (localId * decimationFactor);

        float result = 0.0f;
        for (int tap = 0; tap < taps; tap++) {
            result += localInput[posLocalNewest - tap] * coefs[tap];
        }

        output[posOutput] = result;
    }
}

大有改进！但是，性能仍然与总运算量不成比例（即没有随抽取因子成比例提升）：

与第一种方法相比，全卷积加速比：〜12％
计算抽取时间：
- 抽取因子2：61％
- 抽取因子4：46％
- 抽取因子8：53％
- 抽取因子16：68％

性能最佳的抽取因子为4。为什么？还有进一步改进的想法吗？

编辑2：具有共享本地内存的图：

编辑3：比较3种不同实现的性能

Answer 1

由于数据重叠量较大（66％），在工作组内的各工作项之间共享从内存读取的数据会带来好处：这样既能消除冗余读取，又能实现合并读取。请将内核分为两部分：第一部分以合并读取的方式，把工作组所需的全部数据读入共享的本地内存；随后用内存屏障进行同步；第二部分再从共享本地内存中读取数据进行卷积。

P.S. 感谢您提供的图表，它比阅读代码更快地帮我理解了您的目标。

OpenCL 1D跨越式卷积性能

1 个答案: