将for循环转换为FPGA的最佳方法

时间:2015-10-07 13:24:43

标签: signal-processing verilog fpga

我不太清楚怎样才能在FPGA中最好地实现一段带有for循环的C代码(这已经不是我第一次卡在这个问题上了)。

C代码片段如下所示:

// Correlate a window of sample_data_buffer (starting at index
// sample_index+d_circ_buf_size-sync_pattern_size) with sync_pattern over sync_pattern_size elements.
// NOTE(review): dot_product is not shown here; presumably it writes its result through the
// first argument (&corr_sum) — confirm against its definition.
dot_product(&corr_sum, &sample_data_buffer[sample_index+d_circ_buf_size-sync_pattern_size], &sync_pattern[0], sync_pattern_size);
// Add the absolute value of that result to the running total.
abs_corr_sum += abs(corr_sum);

非常直截了当:它计算两个复数向量的点积,并对结果的绝对值进行累加求和。

我尝试用Verilog复现它:

// Attempt to evaluate the whole correlation loop in one procedural block.
// NOTE(review): the sensitivity list names only sample_index, so in simulation this block
// re-runs only when sample_index changes; changes to the buffers or pattern alone do not
// trigger it. Signal declarations (widths, signedness) are not visible here.
always @(sample_index)
begin
    // for each incoming sample
    // Both accumulators restart from zero on every evaluation of the block.
    abs_corr_sum = 64'd0;
    corr_sum = 64'd0;
    for (index2 = 0; index2 < sync_pattern_size; index2 = index2 + 1'b1)
    begin
        // corr_sum is overwritten (not accumulated) with the I*I + Q*Q term of element index2.
        // NOTE(review): the C snippet takes abs() of a complete dot product, whereas this loop
        // takes the absolute value of every individual term — confirm which is intended.
        corr_sum = sample_data_buffer_I[index2+sample_index+circ_buf_size-sync_pattern_size] * sync_pattern_I[index2]
                   + sample_data_buffer_Q[index2+sample_index+circ_buf_size-sync_pattern_size] * sync_pattern_Q[index2];

        //this is my quick and dirty abs(corr_sum) summer
        // Adds the two's-complement negation (~x + 1) when corr_sum compares below zero,
        // otherwise adds corr_sum unchanged.
        // NOTE(review): if corr_sum is declared as an unsigned reg, (corr_sum < 0) can never be
        // true — verify the declaration.
        abs_corr_sum = (corr_sum < 0) ? abs_corr_sum + ~$signed(corr_sum)+1 : abs_corr_sum + corr_sum;
    end // for (index2 = 0; index2 < sync_pattern_size; index2 = index2 + 1'b1)
end //always @(sample_index)

这看起来是对的吗?我没有得到我期待的结果;虽然问题可能在其他地方,但我认为这一部分是最可能的罪魁祸首。

1 个答案:

答案 0 :(得分:2)

要将来自带有循环,条件等的算法的代码转换为Verilog的可综合形式,您需要将其转换为FSM。

例如,与你想要实现的功能类似的for循环如下:

// Reference C version of the loop that is to be turned into hardware.
// N (the vector length) is not defined in this fragment; abs() is the <stdlib.h> integer abs.
int sample_I[N], sync_I[N]; // assume 32-bit ints, 2-complement numbers.
int sample_Q[N], sync_Q[N];
int i, corsum, abscorsum = 0; // only abscorsum is initialized by this declaration

for (i=0;i<N;i++)
{
  // Per-element term: I*I + Q*Q for index i (overwrites corsum each iteration).
  corsum = sample_I[i] * sync_I[i] + sample_Q[i] * sync_Q[i];
  // Accumulate the absolute value of that term.
  abscorsum += abs(corsum);
}

首先,将各条语句按时隙分组,这样您就可以看出哪些操作可以在同一个时钟周期(同一个状态)内完成,然后为每个时隙分配一个状态:

1)

i = 0
abscorsum = 0
goto 2)

2)

if i!=N    
  corsum = sample_I[i] * sync_I[i]
  goto 3)
else
  goto 5)

3)

corsum = corsum + sample_Q[i] * sync_Q[i]
i = i + 1
goto 4)

4)

if (corsum >= 0)
  abscorsum = abscorsum + corsum
else
  abscorsum = abscorsum + (-corsum)
goto 2)

5)

STOP

状态2和3可以合并为单个状态,但这会迫使综合器推断出两个乘法器,此外,所得到的组合路径的传播延迟可能非常高,从而限制此设计允许的时钟频率。因此,我将点积计算分为两部分,每部分只使用一次乘法运算。如果给出相应的指示,综合器可以只使用一个乘法器,并让这两次运算共享它,因为它们发生在不同的时钟周期中。

转换为此模块: http://www.edaplayground.com/x/MEG

信号rst用于指示模块开始操作。模块拉高finish以表示操作结束,并且输出(abscorrsum)有效。

sample_I、sync_I、sample_Q和sync_Q使用存储块建模,i是要读取的元素的地址。大多数综合器会为这些向量推断出块RAM,因为它们中的每一个都只在一个状态下被读取,并且始终使用相同的地址信号。

// corrdotprod: FSM that accumulates |sample_i*sync_i + sample_q*sync_q| over N elements.
//
// Parameters:
//   N          - number of elements to process (loop bound compared against i).
//
// Ports:
//   clk        - clock; all registers update on its rising edge.
//   rst        - synchronous restart. While high, the FSM is forced to STATE1 and finish
//                is cleared; the computation begins on the first rising edge with rst low.
//   i          - index of the element currently being processed. The instantiating design
//                is expected to present element i of each vector on the four data inputs.
//   sample_i, sync_i, sample_q, sync_q - data inputs for element i.
//   abscorrsum - running sum of absolute values; final once finish is high.
//   finish     - raised in STATE5 when all N elements have been processed.
//
// All arithmetic is 32 bits wide and wraps modulo 2^32; values are treated as
// two's-complement numbers (sign taken from bit 31).
//
// State sequence per element: STATE2 (first product) -> STATE3 (add second product,
// advance i) -> STATE4 (accumulate absolute value) -> back to STATE2.
module corrdotprod #(parameter N=4) (   // explicit 'parameter' keeps this valid Verilog-2001
  input wire clk,
  input wire rst,
  output reg [31:0] i,
  input wire [31:0] sample_i,
  input wire [31:0] sync_i,
  input wire [31:0] sample_q,
  input wire [31:0] sync_q,
  output reg [31:0] abscorrsum,
  output reg finish
);

  // FSM state encodings.
  parameter
    STATE1 = 3'd1,   // initialise index, accumulator and finish flag
    STATE2 = 3'd2,   // loop test + first product
    STATE3 = 3'd3,   // second product + index increment
    STATE4 = 3'd4,   // accumulate |corrsum|
    STATE5 = 3'd5;   // done: hold finish high

  reg [31:0] corrsum;  // per-element sum of the two products
  reg [2:0] state;

  always @(posedge clk) begin
    if (rst == 1'b1) begin
      state <= STATE1;
      // Clear finish immediately so that a stale (or uninitialised) "done" flag from a
      // previous run is never visible once a restart has been requested.
      finish <= 1'b0;
    end
    else begin
      case (state)
        STATE1:
          begin
            i <= 0;
            abscorrsum <= 0;
            finish <= 1'b0;
            state <= STATE2;
          end
        STATE2:
          begin
            if (i!=N) begin
              corrsum <= sample_i * sync_i; // synthesizer deals with multiplication
              state <= STATE3;
            end
            else begin
              // All N elements consumed.
              state <= STATE5;
            end
          end
        STATE3:
          begin
            // Inputs still correspond to the old value of i here; the increment only
            // takes effect after this clock edge.
            corrsum <= corrsum + sample_q * sync_q; // this product can share the multiplier as above
            i <= i + 1;
            state <= STATE4;
          end
        STATE4:
          begin
            if (corrsum[31] == 1'b0) // remember: 2-complement
              abscorrsum <= abscorrsum + corrsum;
            else
              abscorrsum <= abscorrsum + (~corrsum+1); // two's-complement negation
            state <= STATE2;
          end
        STATE5:
          // Terminal state: stays here (no state change) until rst is asserted again.
          finish <= 1'b1;
      endcase
    end
  end
endmodule

可以使用这个简单的测试平台进行测试:

// Self-checking testbench for corrdotprod with N = 4.
// Loads four-element I/Q sample and sync vectors, runs the module once, prints its result,
// recomputes the same sum behaviourally and reports whether the two values agree.
module tb;
  reg clk;
  reg rst;
  // Test vectors; the module's index output i selects which element is presented.
  reg [31:0] sample_i[0:3];
  reg [31:0] sync_i[0:3];
  reg [31:0] sample_q[0:3];
  reg [31:0] sync_q[0:3];
  wire [31:0] i;
  wire [31:0] abscorrsum;
  wire finish;  // declared explicitly instead of relying on an implicit net

  // Named connections so the hookup does not depend on port order.
  corrdotprod #(.N(4)) uut (
    .clk        (clk),
    .rst        (rst),
    .i          (i),
    .sample_i   (sample_i[i]),
    .sync_i     (sync_i[i]),
    .sample_q   (sample_q[i]),
    .sync_q     (sync_q[i]),
    .abscorrsum (abscorrsum),
    .finish     (finish)
  );

  integer tb_i, tb_corrsum, tb_abscorrsum;
  initial begin
    $dumpfile ("dump.vcd");
    $dumpvars (0, tb.uut);    

    sample_i[0] = 1;
    sample_i[1] = 2;
    sample_i[2] = 3;
    sample_i[3] = 4;

    sync_i[0] = 2;
    sync_i[1] = -2;
    sync_i[2] = 2;
    sync_i[3] = -2;

    sample_q[0] = -1;
    sample_q[1] = -2;
    sample_q[2] = -3;
    sample_q[3] = -4;

    sync_q[0] = 3;
    sync_q[1] = -3;
    sync_q[2] = 3;
    sync_q[3] = -3;

    clk = 0;

    // Hold reset across several clock edges, then release it to start the computation.
    rst = 1;
    #30;
    rst = 0;
    wait (finish == 1);
    $display ("ABSCORRSUM    = %d\n", abscorrsum);

    // Testing result from module
    tb_abscorrsum = 0;
    for (tb_i = 0; tb_i < 4; tb_i = tb_i + 1) begin
      tb_corrsum = sample_i[tb_i] * sync_i[tb_i] + sample_q[tb_i] * sync_q[tb_i];
      if (tb_corrsum<0)
        tb_corrsum = -tb_corrsum;
      tb_abscorrsum = tb_abscorrsum + tb_corrsum;
    end
    $display ("TB_ABSCORRSUM = %d\n", tb_abscorrsum);

    // Automatic comparison so a mismatch is not left to visual inspection.
    if (abscorrsum === tb_abscorrsum)
      $display ("PASS");
    else
      $display ("FAIL: module and reference results differ");

    $finish;
  end

  // Watchdog: end the simulation if finish is never raised.
  initial begin
    #100000;
    $display ("TIMEOUT: finish was never asserted");
    $finish;
  end

  // 10-time-unit clock. The delay precedes the evaluation of !clk, so the first toggle
  // happens at t=5 using the value assigned in the initial block (no time-0 race).
  always begin
    #5 clk = !clk;
  end
endmodule