我正在尝试为OpenCL矩阵乘法示例实现互斥体。这个想法是为每个个人添加一个项目实施一种工具,从而能够自动进行打印乘法次数。目前,我实现了自旋锁。内核代码是
#include "../host/inc/matrixMult.h"
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif
__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
__global float *restrict C,
__global float *A,
__global float *B,
__global double *num,
// Widths of matrices.
int A_width, int B_width){
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];
// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);
// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);
// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;
float running_sum = 0.0f;
// Compute the matrix multiplication result for this output element. Each
// loop iteration processes one block of the matrix.
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
// Load the matrices to local memory. Note that the (x, y) indices
// are swapped for A_local and B_local. This affects the reads from
// A_local and B_local below and result in more efficient hardware.
//
// This is actually an optimization that the compiler can perform,
// but is shown here for illustration purposes.
A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
B_local[local_x][local_y] = B[b + B_width * local_y + local_x];
// Wait for the entire block to be loaded.
barrier(CLK_LOCAL_MEM_FENCE);
// Do the dot product accumulation within this block. Fully unroll the loop.
// As a result of the swap of indices above, memory accesses to
// A_local and B_local are very efficient because each loop iteration
// accesses consecutive elements. This can be seen by unrolling the
// loop and analyzing the regions that are loaded:
// A_local[local_y][0..BLOCK_SIZE-1] and
// B_local[local_x][0..BLOCK_SIZE-1]
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
{
running_sum += A_local[local_y][k] * B_local[local_x][k];
// Wait for the block to be fully consumed before loading the next
// block.
barrier(CLK_LOCAL_MEM_FENCE);
}
// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}
我在下面添加了Mutex
#include "../host/inc/matrixMult.h"
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif
__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
__global float *restrict C,
__global float *A,
__global float *B,
__global double *num,
// Widths of matrices.
int A_width, int B_width){
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];
// ///////////////////////
__local mutex;
mutex = 0;
// ////////////////////////
// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);
// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);
// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;
float running_sum = 0.0f;
// Compute the matrix multiplication result for this output element. Each
// loop iteration processes one block of the matrix.
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
// Load the matrices to local memory. Note that the (x, y) indices
// are swapped for A_local and B_local. This affects the reads from
// A_local and B_local below and result in more efficient hardware.
//
// This is actually an optimization that the compiler can perform,
// but is shown here for illustration purposes.
A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
B_local[local_x][local_y] = B[b + B_width * local_y + local_x];
// Wait for the entire block to be loaded.
barrier(CLK_LOCAL_MEM_FENCE);
// Do the dot product accumulation within this block. Fully unroll the loop.
// As a result of the swap of indices above, memory accesses to
// A_local and B_local are very efficient because each loop iteration
// accesses consecutive elements. This can be seen by unrolling the
// loop and analyzing the regions that are loaded:
// A_local[local_y][0..BLOCK_SIZE-1] and
// B_local[local_x][0..BLOCK_SIZE-1]
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
{
running_sum += A_local[local_y][k] * B_local[local_x][k];
// Mutex Implementation
while(atomic_cmpxchg(&mutex, 0, 1) == 1);
*num = *num+1; // = num_mul;
atomic_xchg(&mutex, 0);
}
// End Mutex
// Wait for the block to be fully consumed before loading the next
// block.
barrier(CLK_LOCAL_MEM_FENCE);
}
// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}
当我使用仿真器编译程序时,编译正确并可以正确运行,但是完成运行需要很长时间,这很明显是因为锁定。
当我使用FPGA板Stratix V对其进行编译时,它将无法编译,并且会出现以下错误
aoc: OpenCL parser completed successfully. aoc: Optimizing and doing static analysis of code... aoc: Linking with IP library ... Checking if memory usage is larger than 100% Compiler Warning: Vectorized kernel contains loads/stores that cannot be vectorized. This might reduce performance. aoc: First stage compilation completed successfully. Compiling for FPGA. This process may take a long time, please be patient. Error (293007): Current module quartus_map ended unexpectedly. Verify that you have sufficient memory available to compile your design. Error: Flow compile (for project /home/tanash/Music/matrix_mult_mutex_board_fp_no/bin/matrix_mult/top) was not successful Error: ERROR: Error(s) found while running an executable. See report file(s) for error message(s). Message log indicates which executable was run last. Error (23031): Evaluation of Tcl script /home/tanash/Build/intelFPGA/17.1/quartus/common/tcl/internal/qsh_flow.tcl unsuccessful Error: Quartus Prime Shell was unsuccessful. 4 errors, 2965 warnings Error: Compiler Error, not able to generate hardware
我想知道我的代码有什么问题以及如何实现Mutex?我的互斥锁实现不对吗?
答案 0 :(得分:0)
我尝试发表评论,但我的代表还不够。 不是确切的解决方案-或答案-但您可以理解。 在fpga上下文原子中,互斥锁在Xilinx实现上是有问题的,而不是未实现的。
https://forums.xilinx.com/t5/SDAccel/XOCC-fails-with-atomic-functions/m-p/857170#M2054 https://forums.xilinx.com/t5/SDAccel/atomic-inc-needed/m-p/859822#M2115