用VHDL从零开始写RISC-V指令集核——一个简单的开始
0 写在前面的话
本文是我参考国外大神Domipheus搭建16位RISC-V处理器核的过程(http://labs.domipheus.com/blog/tpu-series-quick-links/)整理的个人笔记,用来记录自己从一个VHDL小白开始,逐步搭建自己的处理器核的过程。
希望各位大佬在看过我的博客之后,指出其中的错误和不足之处,这对于一个新人来说是很大的鼓励。
1 从寄存器开始
我们假设处理器核里包含一组寄存器,一共有8个,每个寄存器都是16位的。我们首先要做的就是实现寄存器的读和写。由于处理器核在进行计算时通常使用两个数据作为输入、一个数据作为输出,因此我们的寄存器模块应该能够同时进行两个数据的读操作和一个数据的写操作。具体的实现代码如下所示。
library IEEE;
use IEEE.STD_LOGIC_1164.ALL;
use IEEE.NUMERIC_STD.ALL;

-- Register file: eight 16-bit registers with two read ports (A, B) and one
-- write port (D). Everything happens on the rising edge of I_clk and only
-- while I_en is '1'.
entity reg16_8 is
    port (
        I_clk   : in  std_logic;                      -- clock
        I_en    : in  std_logic;                      -- enable for reads and writes
        I_dataD : in  std_logic_vector(15 downto 0);  -- value to write
        O_dataA : out std_logic_vector(15 downto 0);  -- registered read port A
        O_dataB : out std_logic_vector(15 downto 0);  -- registered read port B
        I_selA  : in  std_logic_vector(2 downto 0);   -- register index for port A
        I_selB  : in  std_logic_vector(2 downto 0);   -- register index for port B
        I_selD  : in  std_logic_vector(2 downto 0);   -- register index for the write
        I_we    : in  std_logic                       -- write enable
    );
end reg16_8;

architecture Behavioral of reg16_8 is
    type reg_file_t is array (0 to 7) of std_logic_vector(15 downto 0);

    -- All registers start at zero.
    signal reg_file : reg_file_t := (others => x"0000");
begin

    access_regs : process (I_clk)
    begin
        if rising_edge(I_clk) then
            if I_en = '1' then
                -- The read ports sample the register contents as they were
                -- before this edge, so a write in the same cycle becomes
                -- visible on the outputs one enabled cycle later.
                O_dataA <= reg_file(to_integer(unsigned(I_selA)));
                O_dataB <= reg_file(to_integer(unsigned(I_selB)));

                if I_we = '1' then
                    reg_file(to_integer(unsigned(I_selD))) <= I_dataD;
                end if;
            end if;
        end if;
    end process access_regs;

end Behavioral;
2 指令的解码
处理器核在运行时首先要对指令进行解码,也就是要知道输入的这条指令是用来做什么的。在这个简单的处理器核里,所有的指令都是16位等长的。下面对这些指令进行定义:一共有下面16条指令,实现了加法减法、与或非、寄存器操作、移位和跳转等功能,其中包含两条保留指令。

通过上面的图可以看出这些指令有不同的格式,这些格式的定义如下

因此我们要做的就是根据上面指令的格式,对输入的指令进行解码。具体的程序如下。
library IEEE;
use IEEE.STD_LOGIC_1164.ALL;

-- Instruction decoder for the 16-bit fixed-width instruction word.
-- On every rising edge of I_clk with I_en = '1' it splits I_dataInst into
-- register selects, an immediate, the ALU operation and the register
-- write-enable. All outputs are registered.
--
-- Field layout used by this decoder:
--   bits 15..12  opcode
--   bits 11..9   destination register (O_selD)
--   bit  8       flag bit, appended to the opcode as the LSB of O_aluop
--   bits 7..5    source register A (O_selA)
--   bits 4..2    source register B (O_selB)
--   bits 7..0    8-bit immediate
entity decode is
    port (
        I_clk      : in  std_logic;
        I_dataInst : in  std_logic_vector(15 downto 0);
        I_en       : in  std_logic;
        O_selA     : out std_logic_vector(2 downto 0);
        O_selB     : out std_logic_vector(2 downto 0);
        O_selD     : out std_logic_vector(2 downto 0);
        -- 16 bits wide although the instruction only carries an 8-bit
        -- immediate: the low byte is replicated into both halves.
        O_dataIMM  : out std_logic_vector(15 downto 0);
        O_regDwe   : out std_logic;
        O_aluop    : out std_logic_vector(4 downto 0)
    );
end decode;

architecture Behavioral of decode is
begin

    process (I_clk)
    begin
        if rising_edge(I_clk) then
            if I_en = '1' then
                O_selA    <= I_dataInst(7 downto 5);
                O_selB    <= I_dataInst(4 downto 2);
                O_selD    <= I_dataInst(11 downto 9);
                O_dataIMM <= I_dataInst(7 downto 0) & I_dataInst(7 downto 0);

                -- Opcode plus flag bit 8. For arithmetic the flag selects
                -- unsigned ('0') or signed ('1') operation.
                O_aluop   <= I_dataInst(15 downto 12) & I_dataInst(8);

                -- These three instructions do not write a result back to the
                -- register file; every other opcode does.
                case I_dataInst(15 downto 12) is
                    when "0111" =>  -- write
                        O_regDwe <= '0';
                    when "1100" =>  -- jump
                        O_regDwe <= '0';
                    when "1101" =>  -- jumpEQ
                        O_regDwe <= '0';
                    when others =>
                        O_regDwe <= '1';
                end case;
            end if;
        end if;
    end process;

end Behavioral;
3 ALU单元
在准备好寄存器,并能够进行指令解码之后,我们可以进行数学和逻辑功能单元的实现。以加法运算为例我们能够通过解码得到两个加数的寄存器地址和加法运算符号的解析。通过寄存器模块能够得到两个加数具体是什么,由此可以进行加法操作。具体的程序如下。
library IEEE;
use IEEE.STD_LOGIC_1164.ALL;
use IEEE.NUMERIC_STD.ALL;
-- Uncomment the following library declaration if instantiating
-- any Xilinx primitives in this code.
--library UNISIM;
--use UNISIM.VComponents.all;
-- Arithmetic/logic unit.
-- On every rising edge of I_clk with I_en = '1' it executes the operation
-- selected by I_aluop(4 downto 1) (the opcode) and I_aluop(0) (the flag bit),
-- and registers I_dataDwe into O_dataWriteReg. O_dataResult and
-- O_shoudlBranch are driven combinationally from the internal result
-- registers, so they change on the same edge as O_dataWriteReg.
entity alu is
    port (
        I_clk          : in  std_logic;
        I_en           : in  std_logic;
        I_dataA        : in  std_logic_vector(15 downto 0);
        I_dataB        : in  std_logic_vector(15 downto 0);
        I_dataDwe      : in  std_logic;
        I_aluop        : in  std_logic_vector(4 downto 0);
        I_PC           : in  std_logic_vector(15 downto 0);
        I_dataIMM      : in  std_logic_vector(15 downto 0);
        O_dataResult   : out std_logic_vector(15 downto 0);
        O_dataWriteReg : out std_logic;
        O_shoudlBranch : out std_logic
    );
end alu;

architecture Behavioral of alu is
    -- One bit wider than the data path so add/sub keep their carry/borrow.
    signal S_dataResult   : std_logic_vector(16 downto 0) := (others => '0');
    signal S_shouldBranch : std_logic := '0';

    -- Map a comparison outcome onto a single flag bit.
    function to_flag(cond : boolean) return std_logic is
    begin
        if cond then
            return '1';
        end if;
        return '0';
    end function;
begin

    process (I_clk)
    begin
        if rising_edge(I_clk) then
            if I_en = '1' then
                O_dataWriteReg <= I_dataDwe;

                case I_aluop(4 downto 1) is

                    when "0000" =>  -- add (flag: 0 = unsigned, 1 = signed)
                        if I_aluop(0) = '0' then
                            S_dataResult <= std_logic_vector(unsigned('0' & I_dataA) + unsigned('0' & I_dataB));
                        else
                            S_dataResult <= std_logic_vector(signed(I_dataA(15) & I_dataA) + signed(I_dataB(15) & I_dataB));
                        end if;
                        S_shouldBranch <= '0';

                    when "0001" =>  -- sub (flag: 0 = unsigned, 1 = signed)
                        if I_aluop(0) = '0' then
                            S_dataResult <= std_logic_vector(unsigned('0' & I_dataA) - unsigned('0' & I_dataB));
                        else
                            -- The operands are 17 bits wide, so the full
                            -- 17-bit register is the target; a 16-bit slice
                            -- here is a length mismatch.
                            S_dataResult <= std_logic_vector(signed(I_dataA(15) & I_dataA) - signed(I_dataB(15) & I_dataB));
                        end if;
                        S_shouldBranch <= '0';

                    when "0010" =>  -- or
                        S_dataResult(15 downto 0) <= I_dataA or I_dataB;
                        S_shouldBranch <= '0';

                    when "0011" =>  -- xor
                        S_dataResult(15 downto 0) <= I_dataA xor I_dataB;
                        S_shouldBranch <= '0';

                    when "0100" =>  -- and
                        S_dataResult(15 downto 0) <= I_dataA and I_dataB;
                        S_shouldBranch <= '0';

                    when "0101" =>  -- not
                        S_dataResult(15 downto 0) <= not I_dataA;
                        S_shouldBranch <= '0';

                    when "0110" =>  -- read: address = A + 5-bit unsigned offset
                        S_dataResult(15 downto 0) <= std_logic_vector(unsigned(I_dataA) + unsigned(I_dataIMM(4 downto 0)));
                        S_shouldBranch <= '0';

                    when "0111" =>  -- write: address = A + 5-bit unsigned offset
                        S_dataResult(15 downto 0) <= std_logic_vector(unsigned(I_dataA) + unsigned(I_dataIMM(4 downto 0)));
                        S_shouldBranch <= '0';

                    when "1000" =>  -- load 8-bit immediate
                        -- flag 0: immediate goes to the high byte, low byte is zero
                        -- flag 1: immediate goes to the low byte, high byte is zero
                        if I_aluop(0) = '0' then
                            S_dataResult(15 downto 0) <= I_dataIMM(7 downto 0) & X"00";
                        else
                            S_dataResult(15 downto 0) <= X"00" & I_dataIMM(7 downto 0);
                        end if;
                        S_shouldBranch <= '0';

                    when "1001" =>  -- cmp: pack comparison flags into the result
                        -- bit 14: A = B
                        -- bit 13: A > B    bit 12: A < B  (flag: 0 = unsigned, 1 = signed)
                        -- bit 11 / bit 10: see note below
                        S_dataResult(14) <= to_flag(I_dataA = I_dataB);
                        -- NOTE(review): the conditions for bits 11 and 10 were
                        -- lost from the listing; "A is zero" / "B is zero" is
                        -- taken from the reference TPU design - confirm.
                        S_dataResult(11) <= to_flag(I_dataA = x"0000");
                        S_dataResult(10) <= to_flag(I_dataB = x"0000");
                        if I_aluop(0) = '0' then
                            S_dataResult(13) <= to_flag(unsigned(I_dataA) > unsigned(I_dataB));
                            S_dataResult(12) <= to_flag(unsigned(I_dataA) < unsigned(I_dataB));
                        else
                            S_dataResult(13) <= to_flag(signed(I_dataA) > signed(I_dataB));
                            S_dataResult(12) <= to_flag(signed(I_dataA) < signed(I_dataB));
                        end if;
                        S_dataResult(16 downto 15) <= "00";
                        S_dataResult(9 downto 0)   <= (others => '0');
                        S_shouldBranch <= '0';

                    when "1010" =>  -- logical shift left by B(3 downto 0)
                        S_dataResult(15 downto 0) <= std_logic_vector(
                            shift_left(unsigned(I_dataA), to_integer(unsigned(I_dataB(3 downto 0)))));
                        S_shouldBranch <= '0';

                    when "1011" =>  -- logical shift right by B(3 downto 0)
                        S_dataResult(15 downto 0) <= std_logic_vector(
                            shift_right(unsigned(I_dataA), to_integer(unsigned(I_dataB(3 downto 0)))));
                        S_shouldBranch <= '0';

                    when "1100" =>  -- jump (always taken)
                        if I_aluop(0) = '0' then
                            -- absolute: new PC is register A
                            S_dataResult(15 downto 0) <= I_dataA;
                        else
                            -- relative: PC + (IMM(10 downto 0) & '0'); the
                            -- 12-bit operand is sign-extended by the signed "+"
                            S_dataResult(15 downto 0) <= std_logic_vector(signed(I_PC) + signed(I_dataIMM(10 downto 0) & '0'));
                        end if;
                        S_shouldBranch <= '1';

                    when "1101" =>  -- jumpEQ (conditional jump)
                        -- The branch target is computed regardless of the condition.
                        if I_aluop(0) = '0' then
                            S_dataResult(15 downto 0) <= I_dataB;
                        else
                            S_dataResult(15 downto 0) <= std_logic_vector(signed(I_PC) + signed(I_dataIMM(4 downto 0)));
                        end if;
                        -- I_dataA is tested against the flag bit positions
                        -- that the cmp opcode writes.
                        case I_dataIMM(15 downto 13) is
                            when "000"  => S_shouldBranch <= I_dataA(14);
                            when "001"  => S_shouldBranch <= I_dataA(11);
                            when "010"  => S_shouldBranch <= I_dataA(10);
                            when "011"  => S_shouldBranch <= not I_dataA(11);
                            when "100"  => S_shouldBranch <= not I_dataA(10);
                            when "101"  => S_shouldBranch <= I_dataA(13);
                            when "110"  => S_shouldBranch <= I_dataA(12);
                            when others => S_shouldBranch <= '0';
                        end case;

                    when others =>  -- remaining opcodes: keep the previous result
                        null;
                end case;
            end if;
        end if;
    end process;

    -- Driven outside the clocked process so the result is visible in the same
    -- cycle as O_dataWriteReg; assigning them inside the process would add one
    -- more register stage and misalign data and write enable.
    O_dataResult   <= S_dataResult(15 downto 0);
    O_shoudlBranch <= S_shouldBranch;

end Behavioral;
4 把它们整合到一起
把它们整合到一起,实现一个基本得不能再基本的处理器核,也就是写一个top文件把这三个模块的信号连接起来。具体的代码如下所示。
library IEEE;
use IEEE.STD_LOGIC_1164.ALL;
use IEEE.NUMERIC_STD.ALL;

-- Uncomment the following library declaration if instantiating
-- any Xilinx primitives in this code.
--library UNISIM;
--use UNISIM.VComponents.all;

-- Top level: wires the instruction decoder, the register file and the ALU
-- together. The ALU result is fed back to the register file's write port and
-- is also exposed on O_dataResult.
entity core is
    port (
        I_clk          : in  std_logic;
        I_en           : in  std_logic;
        I_PC           : in  std_logic_vector(15 downto 0);
        I_dataInst     : in  std_logic_vector(15 downto 0);
        O_dataResult   : out std_logic_vector(15 downto 0);
        O_shoudlBranch : out std_logic
    );
end core;

architecture Behavioral of core is

    component decode
        port (
            I_clk      : in  std_logic;
            I_dataInst : in  std_logic_vector(15 downto 0);
            I_en       : in  std_logic;
            O_selA     : out std_logic_vector(2 downto 0);
            O_selB     : out std_logic_vector(2 downto 0);
            O_selD     : out std_logic_vector(2 downto 0);
            O_dataIMM  : out std_logic_vector(15 downto 0);
            O_regDwe   : out std_logic;
            O_aluop    : out std_logic_vector(4 downto 0)
        );
    end component;

    component reg16_8
        port (
            I_clk   : in  std_logic;
            I_en    : in  std_logic;
            I_dataD : in  std_logic_vector(15 downto 0);
            O_dataA : out std_logic_vector(15 downto 0);
            O_dataB : out std_logic_vector(15 downto 0);
            I_selA  : in  std_logic_vector(2 downto 0);
            I_selB  : in  std_logic_vector(2 downto 0);
            I_selD  : in  std_logic_vector(2 downto 0);
            I_we    : in  std_logic
        );
    end component;

    component alu
        port (
            I_clk          : in  std_logic;
            I_en           : in  std_logic;
            I_dataA        : in  std_logic_vector(15 downto 0);
            I_dataB        : in  std_logic_vector(15 downto 0);
            I_dataDwe      : in  std_logic;
            I_aluop        : in  std_logic_vector(4 downto 0);
            I_PC           : in  std_logic_vector(15 downto 0);
            I_dataIMM      : in  std_logic_vector(15 downto 0);
            O_dataResult   : out std_logic_vector(15 downto 0);
            O_dataWriteReg : out std_logic;
            O_shoudlBranch : out std_logic
        );
    end component;

    -- Decoder outputs
    signal dec_selA   : std_logic_vector(2 downto 0);
    signal dec_selB   : std_logic_vector(2 downto 0);
    signal dec_selD   : std_logic_vector(2 downto 0);
    signal dec_imm    : std_logic_vector(15 downto 0);
    signal dec_regDwe : std_logic;
    signal dec_aluop  : std_logic_vector(4 downto 0);

    -- Register file read ports
    signal reg_dataA  : std_logic_vector(15 downto 0);
    signal reg_dataB  : std_logic_vector(15 downto 0);

    -- ALU outputs, looped back into the register file write port
    signal alu_result   : std_logic_vector(15 downto 0);
    signal alu_writeReg : std_logic;

begin

    O_dataResult <= alu_result;

    decode0 : decode
        port map (
            I_clk      => I_clk,
            I_dataInst => I_dataInst,
            I_en       => I_en,
            O_selA     => dec_selA,
            O_selB     => dec_selB,
            O_selD     => dec_selD,
            O_dataIMM  => dec_imm,
            O_regDwe   => dec_regDwe,
            O_aluop    => dec_aluop
        );

    reg16_80 : reg16_8
        port map (
            I_clk   => I_clk,
            I_en    => I_en,
            I_dataD => alu_result,
            O_dataA => reg_dataA,
            O_dataB => reg_dataB,
            I_selA  => dec_selA,
            I_selB  => dec_selB,
            I_selD  => dec_selD,
            I_we    => alu_writeReg
        );

    alu0 : alu
        port map (
            I_clk          => I_clk,
            I_en           => I_en,
            I_dataA        => reg_dataA,
            I_dataB        => reg_dataB,
            I_dataDwe      => dec_regDwe,
            I_aluop        => dec_aluop,
            I_PC           => I_PC,
            I_dataIMM      => dec_imm,
            O_dataResult   => alu_result,
            O_dataWriteReg => alu_writeReg,
            O_shoudlBranch => O_shoudlBranch
        );

end Behavioral;
下面是这个core的testbench,进行了一个简单的验证
--------------------------------------------------------------------------------
-- Testbench for "core": drives four instructions, each held for three clock
-- periods, with the core permanently enabled. Results are checked by
-- inspecting the waveform; the bench contains no assertions.
--------------------------------------------------------------------------------
LIBRARY ieee;
USE ieee.std_logic_1164.ALL;

-- Uncomment the following library declaration if using
-- arithmetic functions with Signed or Unsigned values
--USE ieee.numeric_std.ALL;

ENTITY core_tb IS
END core_tb;

ARCHITECTURE behavior OF core_tb IS

    -- Component Declaration for the Unit Under Test (UUT)
    COMPONENT core
        PORT (
            I_clk          : IN  std_logic;
            I_en           : IN  std_logic;
            I_PC           : IN  std_logic_vector(15 downto 0);
            I_dataInst     : IN  std_logic_vector(15 downto 0);
            O_dataResult   : OUT std_logic_vector(15 downto 0);
            O_shoudlBranch : OUT std_logic
        );
    END COMPONENT;

    -- Clock period definitions
    constant I_clk_period : time := 10 ns;

    -- Stimulus instructions (fields: opcode | rD | flag | rA/rB or immediate)
    constant INST_LOAD_R6_LOW  : std_logic_vector(15 downto 0) := "1000" & "1101" & x"F1";  -- load, rD = r6, flag 1, imm F1
    constant INST_LOAD_R1_HIGH : std_logic_vector(15 downto 0) := "1000" & "0010" & x"F1";  -- load, rD = r1, flag 0, imm F1
    constant INST_ADD_R0_R6_R1 : std_logic_vector(15 downto 0) := "0000000011000100";       -- add,  rD = r0, rA = r6, rB = r1
    constant INST_SUB_R2_R1_R6 : std_logic_vector(15 downto 0) := "0001010000111000";       -- sub,  rD = r2, rA = r1, rB = r6

    -- Inputs
    signal I_clk      : std_logic := '0';
    signal I_en       : std_logic := '0';
    signal I_PC       : std_logic_vector(15 downto 0) := (others => '0');
    signal I_dataInst : std_logic_vector(15 downto 0) := (others => '0');

    -- Outputs
    signal O_dataResult   : std_logic_vector(15 downto 0);
    signal O_shoudlBranch : std_logic;

BEGIN

    -- Instantiate the Unit Under Test (UUT)
    uut : core
        PORT MAP (
            I_clk          => I_clk,
            I_en           => I_en,
            I_PC           => I_PC,
            I_dataInst     => I_dataInst,
            O_dataResult   => O_dataResult,
            O_shoudlBranch => O_shoudlBranch
        );

    -- Free-running clock, 50 % duty cycle
    I_clk_process : process
    begin
        I_clk <= '0';
        wait for I_clk_period / 2;
        I_clk <= '1';
        wait for I_clk_period / 2;
    end process;

    -- Stimulus process
    stim_proc : process
    begin
        I_en <= '1';
        I_PC <= (others => '0');

        -- Change the instruction half a period in, i.e. on a rising clock
        -- edge, then hold each instruction for three clock periods.
        wait for I_clk_period * 0.5;
        I_dataInst <= INST_LOAD_R6_LOW;

        wait for I_clk_period * 3;
        I_dataInst <= INST_LOAD_R1_HIGH;

        wait for I_clk_period * 3;
        I_dataInst <= INST_ADD_R0_R6_R1;

        wait for I_clk_period * 3;
        I_dataInst <= INST_SUB_R2_R1_R6;

        wait;  -- stimulus finished; the clock keeps running
    end process;

END;
验证的结果如下

这个验证仅包括了解码、寄存器取数和计算3个过程,因此有3个时钟的延时。但是这个延时仅仅是在testbench中通过手动控制实现的,后面需要加上一个控制模块,使解码、寄存器和ALU能够按照控制的节拍顺序地进行计算。
5 写在结尾的话
希望各位大佬在看过我的博客之后,指出其中的错误和不足之处,这对于一个新人来说是很大的鼓励。
如果能够被有幸转载请您联系我,我将十分感激Thanks♪(・ω・)ノ

浙公网安备 33010602011771号