Question

我正在尝试为OpenCL矩阵乘法示例实现互斥体。这个想法是为每个个人添加一个项目实施一种工具，从而能够自动进行打印乘法次数。目前，我实现了自旋锁。内核代码是

#include "../host/inc/matrixMult.h"
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif

__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
             __global float *restrict C,
             __global float *A,
             __global float *B,
             __global double *num,
             // Widths of matrices.
             int A_width, int B_width){
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];

// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);

// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);

// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end   = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;

float running_sum = 0.0f;

// Compute the matrix multiplication result for this output element. Each
// loop iteration processes one block of the matrix.
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
    // Load the matrices to local memory. Note that the (x, y) indices
    // are swapped for A_local and B_local. This affects the reads from
    // A_local and B_local below and result in more efficient hardware.
    //
    // This is actually an optimization that the compiler can perform,
    // but is shown here for illustration purposes.
    A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
    B_local[local_x][local_y] = B[b + B_width * local_y + local_x];

    // Wait for the entire block to be loaded.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Do the dot product accumulation within this block. Fully unroll the loop.
    // As a result of the swap of indices above, memory accesses to
    // A_local and B_local are very efficient because each loop iteration
    // accesses consecutive elements. This can be seen by unrolling the
    // loop and analyzing the regions that are loaded:
    //  A_local[local_y][0..BLOCK_SIZE-1] and
    //  B_local[local_x][0..BLOCK_SIZE-1]

    #pragma unroll
   for (int k = 0; k < BLOCK_SIZE; ++k)
    {
    running_sum += A_local[local_y][k] * B_local[local_x][k];
   // Wait for the block to be fully consumed before loading the next
    // block.
    barrier(CLK_LOCAL_MEM_FENCE);
}

// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}

我在下面添加了Mutex

#include "../host/inc/matrixMult.h"
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif

__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
             __global float *restrict C,
             __global float *A,
             __global float *B,
             __global double *num,
             // Widths of matrices.
             int A_width, int B_width){
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];
// ///////////////////////
__local mutex;
mutex = 0;

// ////////////////////////

// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);

// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);

// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end   = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;

float running_sum = 0.0f;

// Compute the matrix multiplication result for this output element. Each
// loop iteration processes one block of the matrix.
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
    // Load the matrices to local memory. Note that the (x, y) indices
    // are swapped for A_local and B_local. This affects the reads from
    // A_local and B_local below and result in more efficient hardware.
    //
    // This is actually an optimization that the compiler can perform,
    // but is shown here for illustration purposes.
    A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
    B_local[local_x][local_y] = B[b + B_width * local_y + local_x];

    // Wait for the entire block to be loaded.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Do the dot product accumulation within this block. Fully unroll the loop.
    // As a result of the swap of indices above, memory accesses to
    // A_local and B_local are very efficient because each loop iteration
    // accesses consecutive elements. This can be seen by unrolling the
    // loop and analyzing the regions that are loaded:
    //  A_local[local_y][0..BLOCK_SIZE-1] and
    //  B_local[local_x][0..BLOCK_SIZE-1]

    #pragma unroll
   for (int k = 0; k < BLOCK_SIZE; ++k)
    {
    running_sum += A_local[local_y][k] * B_local[local_x][k];
   // Mutex Implementation
    while(atomic_cmpxchg(&mutex, 0, 1) == 1);
        *num = *num+1; // = num_mul;
        atomic_xchg(&mutex, 0);
    }
   // End Mutex
   // Wait for the block to be fully consumed before loading the next
    // block.
    barrier(CLK_LOCAL_MEM_FENCE);
}

// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}

当我使用仿真器编译程序时，编译正确并可以正确运行，但是完成运行需要很长时间，这很明显是因为锁定。

当我使用FPGA板Stratix V对其进行编译时，它将无法编译，并且会出现以下错误

aoc: OpenCL parser completed successfully.
  aoc: Optimizing and doing static analysis of code...
  aoc: Linking with IP library ...
  Checking if memory usage is larger than 100%
  Compiler Warning: Vectorized kernel contains loads/stores that cannot        be vectorized. This might reduce performance.
  aoc: First stage compilation completed successfully.
  Compiling for FPGA. This process may take a long time, please be patient.
  Error (293007): Current module quartus_map ended unexpectedly. Verify that you have sufficient memory available to compile your design.
 Error: Flow compile (for project /home/tanash/Music/matrix_mult_mutex_board_fp_no/bin/matrix_mult/top) was not successful
 Error: ERROR: Error(s) found while running an executable. See report file(s) for error message(s). Message log indicates which executable was run last.
 Error (23031): Evaluation of Tcl script /home/tanash/Build/intelFPGA/17.1/quartus/common/tcl/internal/qsh_flow.tcl unsuccessful
 Error: Quartus Prime Shell was unsuccessful. 4 errors, 2965 warnings
 Error: Compiler Error, not able to generate hardware

我想知道我的代码有什么问题以及如何实现Mutex？我的互斥锁实现不对吗？

Answer 1

我尝试发表评论，但我的代表还不够。不是确切的解决方案-或答案-但您可以理解。在fpga上下文原子中，互斥锁在Xilinx实现上是有问题的，而不是未实现的。

https://forums.xilinx.com/t5/SDAccel/XOCC-fails-with-atomic-functions/m-p/857170#M2054 https://forums.xilinx.com/t5/SDAccel/atomic-inc-needed/m-p/859822#M2115

在适用于矩阵乘法示例的OpenCL中实现互斥量

1 个答案: