我在vhdl中创建了一个算法sha256的设计。 现在我试图通过了解如何更改代码来提高我的设计水平,这样我将获得更高的功率,性能和面积结果。最终的游戏目标是试图在我的设计中获得最好的网表,这样我就可以将它们变成芯片。
因此,对于我的设计:我在Cyclone 4 FPGA中获得了85 mhz的最大频率,总共使用了8,500个逻辑元件,占了FPGA的55%。
我认为我的设计如此之大的主要问题是我以层次结构的方式编写代码,很多“elsif”和变量。另外一件事可能会更好,我认为,如果quartus将我的内存设计实现为内存而不是逻辑元素,即使它只有16个32位字的数组。 你们认为我能改进什么?
library ieee;
use ieee.std_logic_1164.all;
use ieee.std_logic_unsigned.all;
USE ieee.numeric_std.ALL;
entity padding is
port( clk : in std_logic;
rst : in std_logic;
ward : in std_logic_vector(31 downto 0);
ready : out std_logic;
hash : out std_logic_vector(255 downto 0));
end;
architecture padding of padding is
component sha256
port ( clk : in std_logic;
rst : in std_logic;
enable : in std_logic;
ward : in std_logic_vector(31 downto 0);
k : in std_logic_vector(31 downto 0);
h0 : in std_logic_vector(31 downto 0);
h1 : in std_logic_vector(31 downto 0);
h2 : in std_logic_vector(31 downto 0);
h3 : in std_logic_vector(31 downto 0);
h4 : in std_logic_vector(31 downto 0);
h5 : in std_logic_vector(31 downto 0);
h6 : in std_logic_vector(31 downto 0);
h7 : in std_logic_vector(31 downto 0);
ready : out std_logic;
digest : out std_logic_vector(255 downto 0));
end component;
type kconst is array ( 0 to 63 ) of std_logic_vector(31 downto 0);
type mem is array ( 0 to 15 ) of std_logic_vector(31 downto 0);
signal k : kconst := (x"428a2f98", x"71374491", x"b5c0fbcf", x"e9b5dba5", x"3956c25b", x"59f111f1", x"923f82a4", x"ab1c5ed5",
x"d807aa98", x"12835b01", x"243185be", x"550c7dc3", x"72be5d74", x"80deb1fe", x"9bdc06a7", x"c19bf174",
x"e49b69c1", x"efbe4786", x"0fc19dc6", x"240ca1cc", x"2de92c6f", x"4a7484aa", x"5cb0a9dc", x"76f988da",
x"983e5152", x"a831c66d", x"b00327c8", x"bf597fc7", x"c6e00bf3", x"d5a79147", x"06ca6351", x"14292967",
x"27b70a85", x"2e1b2138", x"4d2c6dfc", x"53380d13", x"650a7354", x"766a0abb", x"81c2c92e", x"92722c85",
x"a2bfe8a1", x"a81a664b", x"c24b8b70", x"c76c51a3", x"d192e819", x"d6990624", x"f40e3585", x"106aa070",
x"19a4c116", x"1e376c08", x"2748774c", x"34b0bcb5", x"391c0cb3", x"4ed8aa4a", x"5b9cca4f", x"682e6ff3",
x"748f82ee", x"78a5636f", x"84c87814", x"8cc70208", x"90befffa", x"a4506ceb", x"bef9a3f7", x"c67178f2");
signal first_mem : mem:= ( x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000",
x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000");
signal second_mem : mem:= ( x"00000000", x"00000000", x"00000000", x"00000000", x"80000000", x"00000000", x"00000000", x"00000000",
x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000280");
signal enable : std_logic;
signal enable1 : std_logic;
signal enable2 : std_logic;
signal r_d : std_logic;
signal k_in : std_logic_vector(31 downto 0);
signal ward_in : std_logic_vector(31 downto 0);
signal ward_in1 : std_logic_vector(31 downto 0);
signal ward_in2 : std_logic_vector(31 downto 0);
signal h0,h1,h2,h3 : std_logic_vector(31 downto 0);
signal h4,h5,h6,h7 : std_logic_vector(31 downto 0);
signal temp : std_logic_vector(255 downto 0);
signal temp1 : std_logic_vector(255 downto 0);
signal gama0 : std_logic_vector(31 downto 0);
signal gama1 : std_logic_vector(31 downto 0);
signal gama2 : std_logic_vector(31 downto 0);
signal gama3 : std_logic_vector(31 downto 0);
signal gama4 : std_logic_vector(31 downto 0);
signal gama5 : std_logic_vector(31 downto 0);
begin
sha1: sha256 port map(
clk ,
rst ,
enable ,
ward_in ,
k_in ,
h0 ,
h1 ,
h2 ,
h3 ,
h4 ,
h5 ,
h6 ,
h7 ,
enable1 ,
temp );
sha2: sha256 port map(
clk ,
rst ,
enable1 ,
ward_in1 ,
k_in ,
temp(255 downto 224),
temp(223 downto 192),
temp(191 downto 160),
temp(159 downto 128),
temp(127 downto 96 ),
temp(95 downto 64 ),
temp(63 downto 32 ),
temp(31 downto 0 ),
enable2 ,
temp1 );
sha3: sha256 port map(
clk ,
rst ,
r_d ,
ward_in2 ,
k_in ,
h0 ,
h1 ,
h2 ,
h3 ,
h4 ,
h5 ,
h6 ,
h7 ,
ready ,
hash );
h0 <= x"6a09e667";
h1 <= x"bb67ae85";
h2 <= x"3c6ef372";
h3 <= x"a54ff53a";
h4 <= x"510e527f";
h5 <= x"9b05688c";
h6 <= x"1f83d9ab";
h7 <= x"5be0cd19";
process (clk,rst)
variable i : integer;
variable j : integer;
variable m : integer;
variable n : integer;
variable l : integer;
begin
if rst = '0' then
enable <= '0';
i := 0;
j := 0;
m := 9;
n := 15;
l := 8;
elsif clk'event and clk = '1' then
if j = 16 then
j := 0;
end if;
if m = 16 then
m := 0;
end if;
if n = 16 then
n := 0;
end if;
if l = 16 then
l := 0;
end if;
if i = 193 then
i := 0;
elsif i > 144 then
first_mem(n) <= gama4 + first_mem(l) + gama5 + first_mem(n);
ward_in2 <= gama4 + first_mem(l) + gama5 + first_mem(n);
k_in <= k(i-129);
elsif i > 136 then
ward_in2 <= first_mem(n);
k_in <= k(i-129);
elsif i = 136 then
first_mem(n) <= temp1(31 downto 0);
ward_in2 <= temp1(31 downto 0);
k_in <= k(i-129);
elsif i = 135 then
first_mem(n) <= temp1(63 downto 32);
ward_in2 <= temp1(63 downto 32);
k_in <= k(i-129);
elsif i = 134 then
first_mem(n) <= temp1(95 downto 64);
ward_in2 <= temp1(95 downto 64);
k_in <= k(i-129);
elsif i = 133 then
first_mem(n) <= temp1(127 downto 96);
ward_in2 <= temp1(127 downto 96);
k_in <= k(i-129);
elsif i = 132 then
first_mem(n) <= temp1(159 downto 128);
ward_in2 <= temp1(159 downto 128);
k_in <= k(i-129);
elsif i = 131 then
first_mem(n) <= temp1(191 downto 160);
ward_in2 <= temp1(191 downto 160);
k_in <= k(i-129);
elsif i = 130 then
first_mem(n) <= temp1(223 downto 192);
ward_in2 <= temp1(223 downto 192);
k_in <= k(i-129);
elsif i = 129 then
first_mem(15) <= x"00000100";
first_mem(14) <= x"00000000";
first_mem(13) <= x"00000000";
first_mem(12) <= x"00000000";
first_mem(11) <= x"00000000";
first_mem(10) <= x"00000000";
first_mem(9) <= x"00000000";
first_mem(8) <= x"80000000";
first_mem(n) <= temp1(255 downto 224);
ward_in2 <= temp1(255 downto 224);
k_in <= k(i-129);
elsif i = 128 then
elsif i > 79 then
second_mem(j) <= gama2 + second_mem(m) + gama3 + second_mem(j);
ward_in1 <= gama2 + second_mem(m) + gama3 + second_mem(j);
k_in <= k(i-64);
elsif i > 63 then
enable <= '0';
ward_in1 <= second_mem(j);
k_in <= k(i-64);
elsif i > 19 then
first_mem(j) <= gama0 + first_mem(m) + gama1 + first_mem(j);
ward_in <= gama0 + first_mem(m) + gama1 + first_mem(j);
k_in <= k(i);
enable <= '1';
elsif i > 15 then
second_mem(j)<= ward;
first_mem(j) <= gama0 + first_mem(m) + gama1 + first_mem(j);
ward_in <= gama0 + first_mem(m) + gama1 + first_mem(j);
k_in <= k(i);
enable <= '1';
elsif i >= 0 then
first_mem(i) <= ward;
ward_in <= ward;
k_in <= k(i);
enable <= '1';
end if;
i := i + 1;
j := j + 1;
m := m + 1;
n := n + 1;
l := l + 1;
end if;
end process;
process (clk, rst)
begin
if rst = '0' then
r_d <= '0';
elsif clk'event and clk = '1' then
r_d <= enable2;
end if;
end process;
process (clk, rst)
variable f: integer;
variable j: integer;
variable l: integer;
variable m: integer;
begin
if rst = '0' then
f := 2;
j := 15;
l := 1;
m := 14;
elsif clk'event and clk = '1' then
if j = 16 then
j := 0;
end if;
if f = 16 then
f := 0;
end if;
if l = 16 then
l := 0;
end if;
if m = 16 then
m := 0;
end if;
gama0 <= ((first_mem(f)(6 downto 0) & first_mem(f)(31 downto 7)) xor (first_mem(f)(17 downto 0) & first_mem(f)(31 downto 18)) xor ("000" & first_mem(f)(31 downto 3)));
gama1 <= ((first_mem(j)(16 downto 0) & first_mem(j)(31 downto 17)) xor (first_mem(j)(18 downto 0) & first_mem(j)(31 downto 19)) xor ("0000000000" & first_mem(j)(31 downto 10)));
gama4 <= ((first_mem(l)(6 downto 0) & first_mem(l)(31 downto 7)) xor (first_mem(l)(17 downto 0) & first_mem(l)(31 downto 18)) xor ("000" & first_mem(l)(31 downto 3)));
gama5 <= ((first_mem(m)(16 downto 0) & first_mem(m)(31 downto 17)) xor (first_mem(m)(18 downto 0) & first_mem(m)(31 downto 19)) xor ("0000000000" & first_mem(m)(31 downto 10)));
gama2 <= ((second_mem(f)(6 downto 0) & second_mem(f)(31 downto 7)) xor (second_mem(f)(17 downto 0) & second_mem(f)(31 downto 18)) xor ("000" & second_mem(f)(31 downto 3)));
gama3 <= ((second_mem(j)(16 downto 0) & second_mem(j)(31 downto 17)) xor (second_mem(j)(18 downto 0) & second_mem(j)(31 downto 19)) xor ("0000000000" & second_mem(j)(31 downto 10)));
f := f + 1;
j := j + 1;
l := l + 1;
m := m + 1;
end if;
end process;
end;
答案 0 :(得分:1)
elsif,即“优先级/解码”,将影响您的设计频率。利用您剩下的所有可用逻辑资源,您可以考虑一个案例陈述......除非您实际需要优先级/解码。即便如此,如果您能够提供延迟权衡,您可以在几个时钟周期内进行解码(管道解码),您的设计可能会增加频率......最终,您需要运行时序报告并查看慢速路径了解瓶颈。
如果你真的想使用RAM而不是FF,你可以推断一个RAM(创建一个数组),或者如果这对你不起作用,你可以手动实例化一个特定于设备的RAM ....然后,当然,为它添加控制逻辑。如果是原始的,则将其更改为blackbox,以便稍后交换“相同的”ASIC库原语
就“变量”而言,讨论与“VHDL”与“Verilog”相同,或“同步”与“异步”重置,大多只是意见,而我的是“我不是粉丝可合成RTL中的变量“...它们对于合成是合法的,但它们在syn期间”消失“,因此如果您想查看网表并与RTL进行比较,则可以手动跟踪连接。通常没有充分理由拥有变量,因为它们在硬件方面没有任何意义,并且模糊了设计与网表。我喜欢看到wire / net / regs的逻辑类型,因此很清楚你在HW中创建了什么。但是,如你所愿,当我看到它们时,我只会感到畏缩。
同样,就数组而言,我不是“将信号捆绑到数组中”的忠实粉丝......人们会认为它“更快”和“更容易”处理,但对我来说,它进一步模糊了设计。再说一次,这不是非法的,但是当谈到OPC(其他人的代码)时,尝试跟踪信号非常烦人,不仅是在模块内,而且是跨端口的数组......然后,如果它们切片那些数组,或者以其他方式摧毁它们,它变得更加烦人。有点像这种咆哮:)
最终,你可以做任何你想做的事情,特别是在FPGA中,有些人往往不太关注与ASIC相比会产生什么。如果你正在设计一个ASIC,我会说你应该更倾向于更迂腐,能够查看你的RTL并知道将在某种程度上创建什么(因此能够估计门)如果你需要,请数。为此,我强烈建议花时间在绘图程序(例如visio)中绘制您的设计,以包括门,FF,解码器,Mux,FSM,适当的伪代码,时钟和重置树的详细信息,以及所有CDC交叉逻辑等,包括信号名称。一旦你拥有了它,这只是转换为RTL的问题......也可能是对那些与变量有共同看法的人的奖励,你会发现你的绘图中没有变量,因此RTL中没有变量。 :)