我们的设计为五周期流水线工作模式的cpu,实现了RV32I的基本指令集37条。最终运行实现了流水灯和计算四位整数以内的质数数量的C程序代码。下为我们设计的数字系统的框图。
整个数字系统大致分为5个部分:时钟生成电路(既有调用MMCM的ip核,也有自己编写的分频器)、输入输出电路(包括按钮、开关的输入和led灯、数码管和vga的输出)、运算控制电路(CPU)、存储电路(哈佛结构,即存程序的ROM和存数据的RAM分开)、及总线。涉及的宏定义如下:
//----------------------------------------------------------------------
// define.v -- macros shared by every module of the RV32I pipeline.
//
// The header is `include`d at the top of each source file, so it is
// wrapped in an include guard: when several files end up in the same
// compilation unit the macros are defined exactly once.
//----------------------------------------------------------------------
`ifndef DEFINE_V
`define DEFINE_V

//general define
// Active-high reset / enable style control values.
`define RstEnable 1'b1
`define RstDisable 1'b0
`define ZeroWord 32'h0
`define ZeroHalf 16'h0
`define WriteEnable 1'b1
`define WriteDisable 1'b0
`define ReadEnable 1'b1
`define ReadDisable 1'b0
// Bit ranges of the ALU operation code and ALU result-class selector.
`define AluOpBus 7:0
`define AluSelBus 2:0
// Note the polarity: "valid" is encoded as 0, "invalid" as 1.
`define InstValid 1'b0
`define InstInvalid 1'b1
`define True_v 1'b1
`define False_v 1'b0
`define ChipEnable 1'b1
`define ChipDisable 1'b0
`define Branch 1'b1
`define NotBranch 1'b0
`define Stop 1'b1

//Using for instruction
// 7-bit major opcodes (instruction bits [6:0]).
`define EXE_I_INST 7'b0010011
`define EXE_R_INST 7'b0110011
`define EXE_B_INST 7'b1100011
`define EXE_J_INST 7'b1101111
`define EXE_JI_INST 7'b1100111
`define EXE_U_INST 7'b0110111
`define EXE_UPC_INST 7'b0010111
`define EXE_NOP 7'b0000000
`define EXE_IL_INST 7'b0000011
`define EXE_S_INST 7'b0100011
`define EXE_IE_INST 7'b1110011
// 3-bit funct3 values for register/immediate ALU instructions.
// EXE_FUNCT_SR and EXE_FUNCT_SRL are two names for the same value
// (3'b101). The second, identical definition of EXE_FUNCT_SLL that used
// to follow EXE_FUNCT_AND has been removed.
`define EXE_FUNCT_ADD 3'b000
`define EXE_FUNCT_SLL 3'b001
`define EXE_FUNCT_SLT 3'b010
`define EXE_FUNCT_SLTU 3'b011
`define EXE_FUNCT_XOR 3'b100
`define EXE_FUNCT_SR 3'b101
`define EXE_FUNCT_OR 3'b110
`define EXE_FUNCT_AND 3'b111
`define EXE_FUNCT_SRL 3'b101
// funct3 values for conditional branches.
`define EXE_FUNCT_BEQ 3'b000
`define EXE_FUNCT_BNE 3'b001
`define EXE_FUNCT_BLT 3'b100
`define EXE_FUNCT_BGE 3'b101
`define EXE_FUNCT_BLTU 3'b110
`define EXE_FUNCT_BGEU 3'b111
// funct3 values for loads.
`define EXE_FUNCT_LB 3'b000
`define EXE_FUNCT_LH 3'b001
`define EXE_FUNCT_LW 3'b010
`define EXE_FUNCT_LBU 3'b100
`define EXE_FUNCT_LHU 3'b101
// funct3 values for stores.
`define EXE_FUNCT_SB 3'b000
`define EXE_FUNCT_SH 3'b001
`define EXE_FUNCT_SW 3'b010
// funct3 values for JALR and the environment instructions.
`define EXE_FUNCT_JALR 3'b000
`define EXE_FUNCT_EC 3'b000
`define EXE_FUNCT_EB 3'b000

//AluOp
// Internal 8-bit operation codes (width = AluOpBus). For the
// arithmetic/logic/shift codes the low three bits equal the instruction's
// funct3, and bit 5 is set for the SRA and SUB variants.
`define EXE_NOP_OP 8'b00000000
`define EXE_SAVE_OP 8'b00000001
`define EXE_XOR_OP 8'b00010100
`define EXE_AND_OP 8'b00010111
`define EXE_OR_OP 8'b00010110
`define EXE_SLL_OP 8'b00010001
`define EXE_SRL_OP 8'b00010101
`define EXE_SRA_OP 8'b00110101
`define EXE_ADD_OP 8'b00010000
`define EXE_SLT_OP 8'b00010010
`define EXE_SLTU_OP 8'b00010011
`define EXE_SUB_OP 8'b00110000
`define EXE_LB_OP 8'b01000001
`define EXE_LH_OP 8'b01000010
`define EXE_LW_OP 8'b01000011
`define EXE_LBU_OP 8'b01000100
`define EXE_LHU_OP 8'b01000101
`define EXE_SB_OP 8'b01000110
`define EXE_SH_OP 8'b01000111
`define EXE_SW_OP 8'b01001000

//AluSel
// Result class of an operation (width = AluSelBus).
`define EXE_RES_NOP 3'b000
`define EXE_RES_LOGIC 3'b001
`define EXE_RES_SHIFT 3'b010
`define EXE_RES_ARITHMETIC 3'b011
`define EXE_RES_JUMP 3'b100
`define EXE_RES_LOAD 3'b101
`define EXE_RES_STORE 3'b110

//Using for Rom
// RomAddrBusSel picks the bits of the 32-bit PC that form the 13-bit
// ROM address (the two lowest PC bits are dropped).
`define RomAddrBus 12:0
`define RomAddrBusSel 14:2
`define InstAddrBus 31:0
`define InstBus 31:0

//Using for Ram and IO
`define DataAddrBus 31:0
`define ByteSelBus 3:0
`define DataBus 31:0
// Address bits [19:16] select the device on the data bus; the DS_*
// values below are the codes the bus decoder compares them against.
`define DvcSelBusSel 19:16
`define DS_RAM 4'b0000
`define DS_BTN 4'b0100
`define DS_SW 4'b0101
`define DS_LED 4'b1000
`define DS_TUBE 4'b1001
`define DS_VGA 4'b1010
`define RamAddrBus 14:0
`define LedBus 15:0
`define TubeDinBus 27:0

//Using for Regfile
`define RegAddrBus 4:0
`define RegBus 31:0
`define RegWidth 32
`define DoubleRegWidth 64
`define DoubleRegBus 63:0
`define RegNum 32
`define RegNumLog2 5
`define NOPRegAddr 5'b00000

`endif // DEFINE_V
顶层模块代码如下:
`include "define.v"
// Board-level wrapper: generates the clocks and wires the CPU, the two
// memories and all peripherals together through the BUS decoder.
module top_module(
    input         clk,   // 100MHz clock signal from crystal oscillator
    input         rst,   // asynchronous reset, taken from sw[15]
    input  [ 4:0] btn,   // 4 = centre, 3 = up, 2 = down, 1 = left, 0 = right
    input  [14:0] sw,
    output [15:0] led,
    output [11:0] tube,  // [11:8] digit select, [7:0] segments and dot
    output [13:0] vga    // 13 = hsync, 12 = vsync, [11:0] = rgb
);

    //------------------------------------------------------------------
    // Derived clocks
    //------------------------------------------------------------------
    wire clk_cpu;    // 37MHz, main modules
    wire clk_mem;    // 74MHz, ROM and RAM
    wire clk_vga;    // 25MHz, VGA display
    wire clk_tube;   // 200Hz, digital tube scan
    wire clk_flash;  // 1Hz, blinking

    //------------------------------------------------------------------
    // CPU <-> BUS
    //------------------------------------------------------------------
    wire                cpu_rom_ce;
    wire [`InstAddrBus] cpu_rom_addr;
    wire [`InstBus]     cpu_rom_data;
    wire                cpu_ram_ce;
    wire [`ByteSelBus]  cpu_ram_we;
    wire [`DataAddrBus] cpu_ram_addr;
    wire [`DataBus]     cpu_ram_din;
    wire [`DataBus]     cpu_ram_dout;

    //------------------------------------------------------------------
    // BUS <-> ROM
    //------------------------------------------------------------------
    wire                rom_ce;
    wire [`RomAddrBus]  rom_addr;
    wire [`InstBus]     rom_data;

    //------------------------------------------------------------------
    // BUS <-> RAM
    //------------------------------------------------------------------
    wire                ram_ce;
    wire [`ByteSelBus]  ram_we;
    wire [`RamAddrBus]  ram_addr;
    wire [`DataBus]     ram_din;
    wire [`DataBus]     ram_dout;

    //------------------------------------------------------------------
    // BUS <-> peripherals
    //------------------------------------------------------------------
    wire [4:0]          btn_ena;
    wire [4:0]          btn_data;
    wire                led_ena;
    wire [`LedBus]      led_din;
    wire                tube_ena;
    wire [`TubeDinBus]  tube_din;
    wire                menu_ena;
    wire [1:0]          menu_ctrl;

    //------------------------------------------------------------------
    // Clock generators
    //------------------------------------------------------------------
    CLK_GEN CLK_GEN0(
        .clk_i  (clk),
        .clk_o1 (clk_cpu),
        .clk_o2 (clk_mem)
    );

    CLK_GEN_1 CLK_GEN1(
        .clk_i (clk),
        .clk_o (clk_vga)
    );

    // Divider chain: board clock -> clk_tube -> clk_flash.
    CLK_GEN_LOW #(500_000) CLK_GEN_LOW0 (
        .clk_i (clk),
        .rst   (rst),
        .clk_o (clk_tube)
    );

    CLK_GEN_LOW #(200) CLK_GEN_LOW1 (
        .clk_i (clk_tube),
        .rst   (rst),
        .clk_o (clk_flash)
    );

    //------------------------------------------------------------------
    // Processor core
    //------------------------------------------------------------------
    CPU CPU0(
        .clk        (clk_cpu),
        .rst        (rst),
        // instruction side
        .rom_data_i (cpu_rom_data),
        .rom_ce_o   (cpu_rom_ce),
        .rom_addr_o (cpu_rom_addr),
        // data side
        .ram_data_i (cpu_ram_dout),
        .ram_ce_o   (cpu_ram_ce),
        .ram_we_o   (cpu_ram_we),
        .ram_addr_o (cpu_ram_addr),
        .ram_data_o (cpu_ram_din)
    );

    //------------------------------------------------------------------
    // Memories (both clocked by clk_mem)
    //------------------------------------------------------------------
    ROM ROM0(
        .clka  (clk_mem),
        .ena   (rom_ce),
        .addra (rom_addr),
        .douta (rom_data)
    );

    RAM RAM0(
        .clka  (clk_mem),
        .ena   (ram_ce),
        .wea   (ram_we),
        .addra (ram_addr),
        .dina  (ram_din),
        .douta (ram_dout)
    );

    //------------------------------------------------------------------
    // Address decoder / data multiplexer
    //------------------------------------------------------------------
    BUS BUS0(
        // CPU side
        .cpu_rom_ce_i   (cpu_rom_ce),
        .cpu_rom_addr_i (cpu_rom_addr),
        .cpu_rom_data_o (cpu_rom_data),
        .cpu_ram_ce_i   (cpu_ram_ce),
        .cpu_ram_we_i   (cpu_ram_we),
        .cpu_ram_addr_i (cpu_ram_addr),
        .cpu_ram_din_i  (cpu_ram_din),
        .cpu_ram_dout_o (cpu_ram_dout),
        // ROM side
        .rom_ce_o       (rom_ce),
        .rom_addr_o     (rom_addr),
        .rom_data_i     (rom_data),
        // RAM side
        .ram_ce_o       (ram_ce),
        .ram_we_o       (ram_we),
        .ram_addr_o     (ram_addr),
        .ram_din_o      (ram_din),
        .ram_dout_i     (ram_dout),
        // input devices
        .sw_i           (sw),
        .btn_ena_o      (btn_ena),
        .btn_data_i     (btn_data),
        // output devices
        .led_ena_o      (led_ena),
        .led_din_o      (led_din),
        .tube_ena_o     (tube_ena),
        .tube_din_o     (tube_din),
        .menu_ena_o     (menu_ena),
        .menu_ctrl_o    (menu_ctrl)
    );

    //------------------------------------------------------------------
    // Peripherals
    //------------------------------------------------------------------
    BTN BTN0(
        .clk  (clk_cpu),
        .rst  (rst),
        .ena  (btn_ena),
        .din  (btn),
        .dout (btn_data)
    );

    LED LED0(
        .clk (clk_cpu),
        .rst (rst),
        .ena (led_ena),
        .din (led_din),
        .led (led)
    );

    TUBE TUBE0(
        .clk_u (clk_cpu),
        .clk_d (clk_tube),
        .clk_f (clk_flash),
        .rst   (rst),
        .ena   (tube_ena),
        .din   (tube_din),
        .dout  (tube)
    );

    MENU MENU0(
        .clk_u (clk_cpu),
        .clk_d (clk_vga),
        .clk_f (clk_flash),
        .rst   (rst),
        .ena   (menu_ena),
        .ctrl  (menu_ctrl),
        .vga   (vga)
    );

endmodule
其中,所有输入输出设备都是挂载在总线上的,并且各自分配了内存地址,使得CPU可以通过执行指向这些地址的L型指令和S型指令实现从输入设备获取输入和向输出设备输出的功能。本实验中的总线由于是片上的,无法使用三态门,也就无法在设计中赋高阻态,故这里的总线更接近数据选择器的效果,代码如下:
`include "define.v"
// On-chip "bus": a purely combinational address decoder and read-data
// multiplexer between the CPU and ROM, RAM, buttons, switches, LEDs,
// digital tube and the VGA menu.
module BUS(
    // CPU side
    input                      cpu_rom_ce_i,
    input      [`InstAddrBus]  cpu_rom_addr_i,
    output     [`InstBus]      cpu_rom_data_o,
    input                      cpu_ram_ce_i,
    input      [`ByteSelBus]   cpu_ram_we_i,
    input      [`DataAddrBus]  cpu_ram_addr_i,
    input      [`DataBus]      cpu_ram_din_i,
    output reg [`DataBus]      cpu_ram_dout_o,
    // ROM side
    output                     rom_ce_o,
    output     [`RomAddrBus]   rom_addr_o,
    input      [`InstBus]      rom_data_i,
    // RAM side
    output reg                 ram_ce_o,
    output     [`ByteSelBus]   ram_we_o,
    output     [`RamAddrBus]   ram_addr_o,
    output     [`DataBus]      ram_din_o,
    input      [`DataBus]      ram_dout_i,
    // input devices
    input      [14:0]          sw_i,
    output reg [4:0]           btn_ena_o,
    input      [4:0]           btn_data_i,
    // output devices
    output reg                 led_ena_o,
    output     [`LedBus]       led_din_o,
    output reg                 tube_ena_o,
    output     [`TubeDinBus]   tube_din_o,
    output reg                 menu_ena_o,
    output     [1:0]           menu_ctrl_o
);

    //------------------------------------------------------------------
    // Instruction path: straight pass-through to the ROM. Only the
    // RomAddrBusSel slice of the CPU address reaches the ROM.
    //------------------------------------------------------------------
    assign rom_ce_o       = cpu_rom_ce_i;
    assign rom_addr_o     = cpu_rom_addr_i[`RomAddrBusSel];
    assign cpu_rom_data_o = rom_data_i;

    //------------------------------------------------------------------
    // Data path: write data, byte enables and address are broadcast to
    // every device; the per-device enables decide who reacts.
    //------------------------------------------------------------------
    assign ram_we_o    = cpu_ram_we_i;
    assign ram_addr_o  = cpu_ram_addr_i[`RamAddrBus];
    assign ram_din_o   = cpu_ram_din_i;
    assign led_din_o   = cpu_ram_din_i[`LedBus];
    assign tube_din_o  = cpu_ram_din_i[`TubeDinBus];
    assign menu_ctrl_o = cpu_ram_din_i[1:0];

    // Device-select field of the current data address.
    wire [3:0] dev_sel = cpu_ram_addr_i[`DvcSelBusSel];

    always @(*) begin
        // Idle values: nothing selected, CPU reads zero.
        ram_ce_o       = `ChipDisable;
        btn_ena_o      = {5{`ChipDisable}};
        led_ena_o      = `ChipDisable;
        tube_ena_o     = `ChipDisable;
        menu_ena_o     = `ChipDisable;
        cpu_ram_dout_o = `ZeroWord;

        if (cpu_ram_ce_i == `ChipEnable) begin
            case (dev_sel)
                `DS_RAM: begin
                    ram_ce_o = `ChipEnable;
                    // RAM data is only returned when no byte is being written.
                    if (cpu_ram_we_i == {4{`WriteDisable}})
                        cpu_ram_dout_o = ram_dout_i;
                    else
                        cpu_ram_dout_o = `ZeroWord;
                end
                `DS_BTN: begin
                    // The low five address bits are forwarded as the
                    // per-button enable mask.
                    btn_ena_o      = cpu_ram_addr_i[4:0];
                    cpu_ram_dout_o = {27'b0, btn_data_i};
                end
                `DS_SW: begin
                    cpu_ram_dout_o = {17'b0, sw_i};
                end
                `DS_LED: begin
                    led_ena_o = `ChipEnable;
                end
                `DS_TUBE: begin
                    tube_ena_o = `ChipEnable;
                end
                `DS_VGA: begin
                    menu_ena_o = `ChipEnable;
                end
                default: begin
                    // unmapped address: keep the idle values
                end
            endcase
        end
    end

endmodule
我在以下部分详细解释这个CPU模块的不同部分:
CPU顶层端口:
- clk(时钟)和 rst(复位):这是数字电路的基本信号,用于控制操作的时序和初始化状态。
- rom_data_i:从ROM接收的指令数据。
- rom_ce_o:ROM的芯片使能信号,用于控制ROM的操作。
- rom_addr_o:从CPU发送到ROM的地址信号,用于指定要读取的指令地址。
- ram_data_i:从RAM接收的数据。
- ram_ce_o:RAM的芯片使能信号。
- ram_we_o:RAM的写使能信号,控制数据写入。
- ram_addr_o:到RAM的地址信号。
- ram_data_o:发送到RAM的数据。
程序计数器(pc_reg):
- 使用 clk(时钟)和 rst(复位)信号来同步和初始化。
- branch_flag 和 branch_target_address 用于处理分支指令,即当需要跳转到程序中的另一个地址时。
- stallreq 是一个控制信号,用于在特定情况下暂停PC的更新,如等待数据准备好或处理分支和跳转。
- pc 是当前指令的地址,它被发送到ROM以获取相应的指令数据。
取指/译码流水线寄存器(if_id):
- 接收指令地址 (if_pc) 和 从ROM接收的指令数据 (if_inst)。
- 输出指令地址 (id_pc) 和指令内容 (id_inst)。
- stallreq 信号在此模块中也用于处理流水线暂停。
译码模块(id):
- 接收指令地址 (pc_i) 和指令内容 (inst_i)。
- 输出寄存器读使能 (reg1_read, reg2_read),以及提供要读取的寄存器地址 (reg1_addr, reg2_addr)。
- 输出ALU操作码 (id_aluop_o)、ALU操作选择 (id_alusel_o)、操作数 (id_reg1_o, id_reg2_o) 和 写入寄存器标志 (id_wreg_o)。
- 输出分支标志 (branch_flag) 和 目标地址 (branch_target_address)。
停止模块(stop):
- 接收分支标志 (branch_flag_i) 并输出停止信号 (stop_flag_o)。
寄存器堆(regfile):
- 写端口:写使能 (we),写地址 (waddr) 和 写数据 (wdata)。
- 读端口:根据读使能 (re1, re2) 和 读地址 (raddr1, raddr2) 输出读取的数据 (rdata1, rdata2)。
译码/执行流水线寄存器(id_ex):在译码阶段和执行阶段之间传递信息。
执行模块(ex):
- 接收ALU操作码 (aluop_i),操作选择 (alusel_i),操作数 (reg1_i, reg2_i) 和 写使能信号 (wreg_i)。
- 输出执行结果 (wdata_o),写目标寄存器地址 (wd_o) 和 写使能标志 (wreg_o)。
执行/访存流水线寄存器(ex_mem):
- 传递写目标寄存器地址 (ex_wd)、写使能 (ex_wreg)、执行结果 (ex_wdata)、用于内存操作的ALU操作码 (ex_aluop)、内存地址 (ex_mem_addr) 和 第二操作数 (ex_reg2)。
访存模块(mem):
- 接收写使能 (wreg_i)、写目标寄存器地址 (wd_i)、写数据 (wdata_i)、ALU操作码 (aluop_i)、内存地址 (mem_addr_i) 和 第二操作数 (reg2_i)。
- 输出写使能 (wreg_o)、写目标寄存器地址 (wd_o) 和 写数据 (wdata_o)。
访存/写回流水线寄存器(mem_wb):在内存访问阶段和写回阶段之间传递信息。
- 接收来自MEM模块的信号,如写目标寄存器地址 (mem_wd)、写使能 (mem_wreg) 和 写数据 (mem_wdata)。
- 将这些信号传递到写回阶段,用于更新寄存器文件。
`include "define.v"
// Five-stage pipelined processor core.
//
// The module is purely structural: it instantiates the program counter,
// the pipeline registers (IF/ID, ID/EX, EX/MEM, MEM/WB), the ID, EX and
// MEM stage logic, the register file and the `stop` helper, and wires
// them together.
//
// Ports
//   clk, rst    clock and reset, fanned out to every sequential sub-module
//   rom_*       instruction fetch interface (address is the PC)
//   ram_*       data interface driven by the MEM stage; ram_we_o carries
//               the MEM stage's byte-select output
//
// Fix relative to the previous revision: the `stop` instance now has its
// `stallreq` input connected. The port was left floating before, so the
// expression `branch_flag_i & (~stallreq)` inside `stop` was evaluated
// with an undriven operand.
module CPU(
    input  wire                 clk,
    input  wire                 rst,
    //data from and to ROM
    input  wire [`InstBus]      rom_data_i,
    output wire                 rom_ce_o,
    output wire [`InstAddrBus]  rom_addr_o,
    //data from and to RAM
    input  wire [`DataBus]      ram_data_i,
    output wire                 ram_ce_o,
    output wire [`ByteSelBus]   ram_we_o,
    output wire [`DataAddrBus]  ram_addr_o,
    output wire [`DataBus]      ram_data_o
);
    //connect pc with id
    wire [`InstAddrBus] pc;
    wire [`InstAddrBus] id_pc_i;
    wire [`InstBus]     id_inst_i;
    wire                branch_flag;
    wire [`RegBus]      branch_target_address;
    //connect stop with id/ex
    wire                stop_flag;
    // Pipeline stall request: driven by the ID instance, observed by
    // pc_reg, if_id, id_ex and stop.
    wire                stallreq;
    // Result class of the instruction currently in EX, fed back to ID.
    wire [`AluSelBus]   ex_id_alusel;
    //connect id with id/ex
    wire [`AluOpBus]    id_aluop_o;
    wire [`AluSelBus]   id_alusel_o;
    wire [`RegBus]      id_reg1_o;
    wire [`RegBus]      id_reg2_o;
    wire                id_wreg_o;
    wire [`RegAddrBus]  id_wd_o;
    wire [`RegBus]      id_link_addr_o;
    wire [`InstBus]     id_inst_o;
    //connect id/ex with ex
    wire [`AluOpBus]    ex_aluop_i;
    wire [`AluSelBus]   ex_alusel_i;
    wire [`RegBus]      ex_reg1_i;
    wire [`RegBus]      ex_reg2_i;
    wire                ex_wreg_i;
    wire [`RegAddrBus]  ex_wd_i;
    wire [`RegBus]      ex_link_addr_i;
    wire [`InstBus]     ex_inst_i;
    //connect ex with ex/mem
    wire                ex_wreg_o;
    wire [`RegAddrBus]  ex_wd_o;
    wire [`RegBus]      ex_wdata_o;
    wire [`AluOpBus]    ex_aluop_o;
    wire [`RegBus]      ex_mem_addr_o;
    wire [`RegBus]      ex_reg2_o;
    //connect ex/mem with mem
    wire                mem_wreg_i;
    wire [`RegAddrBus]  mem_wd_i;
    wire [`RegBus]      mem_wdata_i;
    wire [`AluOpBus]    mem_aluop_i;
    wire [`RegBus]      mem_mem_addr_i;
    wire [`RegBus]      mem_reg2_i;
    //connect mem with RAM
    wire [`RegBus]      mem_mem_data_i;
    //connect mem with mem/wb
    wire                mem_wreg_o;
    wire [`RegAddrBus]  mem_wd_o;
    wire [`RegBus]      mem_wdata_o;
    wire [`RegBus]      mem_mem_addr_o;
    wire [`ByteSelBus]  mem_mem_sel_o;
    wire [`RegBus]      mem_mem_data_o;
    wire                mem_mem_ce_o;
    //connect mem/wb with wb
    wire                wb_wreg_i;
    wire [`RegAddrBus]  wb_wd_i;
    wire [`RegBus]      wb_wdata_i;
    //connect id with regfile
    wire                reg1_read;
    wire                reg2_read;
    wire [`RegBus]      reg1_data;
    wire [`RegBus]      reg2_data;
    wire [`RegAddrBus]  reg1_addr;
    wire [`RegAddrBus]  reg2_addr;

    //realize PC
    // pc_reg also produces the ROM chip enable.
    pc_reg pc_reg0(
        .clk(clk), .rst(rst), .pc(pc), .ce(rom_ce_o),
        .branch_flag_i(branch_flag), .branch_target_address_i(branch_target_address),
        .stallreq(stallreq)
    );
    assign rom_addr_o = pc;

    //realize IF/ID
    if_id if_id0(
        .clk(clk), .rst(rst), .if_pc(pc),
        .if_inst(rom_data_i), .id_pc(id_pc_i),
        .id_inst(id_inst_i),
        .stallreq(stallreq)
    );

    //realize ID
    id id0(
        .rst(rst), .pc_i(id_pc_i), .inst_i(id_inst_i),
        //data from regfile
        .reg1_data_i(reg1_data), .reg2_data_i(reg2_data),
        //data to regfile
        .reg1_read_o(reg1_read), .reg2_read_o(reg2_read),
        .reg1_addr_o(reg1_addr), .reg2_addr_o(reg2_addr),
        //data to ex module
        .aluop_o(id_aluop_o), .alusel_o(id_alusel_o),
        .reg1_o(id_reg1_o), .reg2_o(id_reg2_o),
        .wd_o(id_wd_o), .wreg_o(id_wreg_o),
        .link_addr_o(id_link_addr_o),
        .inst_o(id_inst_o),
        //data from ex module (EX-stage result fed back to ID)
        .ex_wdata_i(ex_wdata_o), .ex_wd_i(ex_wd_o), .ex_wreg_i(ex_wreg_o), .ex_alusel_i(ex_id_alusel),
        //data from mem (MEM-stage result fed back to ID)
        .mem_wdata_i(mem_wdata_o), .mem_wd_i(mem_wd_o), .mem_wreg_i(mem_wreg_o),
        .branch_flag_o(branch_flag), .branch_target_address_o(branch_target_address),
        //signal to stop
        .stallreq(stallreq),
        .stop_flag_i(stop_flag)
    );

    //realize stop
    // stallreq must be connected: `stop` combines it with branch_flag_i
    // when computing stop_flag_o.
    stop stop0(
        .clk(clk), .rst(rst),
        .branch_flag_i(branch_flag),
        .stallreq(stallreq),
        .stop_flag_o(stop_flag)
    );

    //realize Regfile
    regfile regfile0(
        .clk(clk), .rst(rst),
        //write port
        .we(wb_wreg_i), .waddr(wb_wd_i), .wdata(wb_wdata_i),
        //read port1
        .re1(reg1_read), .raddr1(reg1_addr), .rdata1(reg1_data),
        //read port2
        .re2(reg2_read), .raddr2(reg2_addr), .rdata2(reg2_data)
    );

    //realize ID/EX
    id_ex id_ex0(
        .clk(clk), .rst(rst),
        //data from id
        .id_aluop(id_aluop_o), .id_alusel(id_alusel_o),
        .id_reg1(id_reg1_o), .id_reg2(id_reg2_o),
        .id_wd(id_wd_o), .id_wreg(id_wreg_o),
        .id_link_address(id_link_addr_o), .if_stop_i(stop_flag),
        .id_inst(id_inst_o), .stallreq(stallreq),
        //data to ex
        .ex_aluop(ex_aluop_i), .ex_alusel(ex_alusel_i),
        .ex_reg1(ex_reg1_i), .ex_reg2(ex_reg2_i),
        .ex_wd(ex_wd_i), .ex_wreg(ex_wreg_i),
        .ex_link_address(ex_link_addr_i),
        .ex_inst(ex_inst_i)
    );

    //realize EX
    ex ex0(
        .rst(rst),
        //data from id module
        .aluop_i(ex_aluop_i), .alusel_i(ex_alusel_i),
        .reg1_i(ex_reg1_i), .reg2_i(ex_reg2_i),
        .wd_i(ex_wd_i), .wreg_i(ex_wreg_i),
        .link_address_i(ex_link_addr_i),
        .inst_i(ex_inst_i),
        //data to id
        .alusel_o(ex_id_alusel),
        //Result of execution
        .wd_o(ex_wd_o), .wreg_o(ex_wreg_o), .wdata_o(ex_wdata_o),
        .aluop_o(ex_aluop_o), .mem_addr_o(ex_mem_addr_o), .reg2_o(ex_reg2_o)
    );

    //realize EX/MEM
    ex_mem ex_mem0(
        .clk(clk), .rst(rst),
        //data from ex module
        .ex_wd(ex_wd_o), .ex_wreg(ex_wreg_o), .ex_wdata(ex_wdata_o),
        .ex_aluop(ex_aluop_o), .ex_mem_addr(ex_mem_addr_o), .ex_reg2(ex_reg2_o),
        //data to mem module
        .mem_wd(mem_wd_i), .mem_wreg(mem_wreg_i), .mem_wdata(mem_wdata_i),
        .mem_aluop(mem_aluop_i), .mem_mem_addr(mem_mem_addr_i), .mem_reg2(mem_reg2_i)
    );

    //realize MEM
    mem mem0(
        .rst(rst),
        //data from ex module
        .wd_i(mem_wd_i), .wreg_i(mem_wreg_i), .wdata_i(mem_wdata_i),
        .aluop_i(mem_aluop_i), .reg2_i(mem_reg2_i), .mem_addr_i(mem_mem_addr_i),
        .mem_data_i(mem_mem_data_i),
        //result
        .wd_o(mem_wd_o), .wreg_o(mem_wreg_o), .wdata_o(mem_wdata_o),
        .mem_addr_o(mem_mem_addr_o), .mem_sel_o(mem_mem_sel_o),
        .mem_data_o(mem_mem_data_o), .mem_ce_o(mem_mem_ce_o)
    );

    //----------realize connection with RAM----------
    assign mem_mem_data_i = ram_data_i;
    assign ram_addr_o     = mem_mem_addr_o;
    assign ram_we_o       = mem_mem_sel_o;
    assign ram_ce_o       = mem_mem_ce_o;
    assign ram_data_o     = mem_mem_data_o;

    //realize MEM/WB
    mem_wb mem_wb0(
        .clk(clk), .rst(rst),
        //data from mem module
        .mem_wd(mem_wd_o), .mem_wreg(mem_wreg_o), .mem_wdata(mem_wdata_o),
        //data to wb module
        .wb_wd(wb_wd_i), .wb_wreg(wb_wreg_i), .wb_wdata(wb_wdata_i)
    );
endmodule
由于流水线的工作模式限制,导致部分数据在写入寄存器之前就需要被读取,造成了数据冲突的问题。对于相隔两条指令的数据冲突,此时要写入的数据已经即将向寄存器堆写入,但是由于寄存器写入操作为时序逻辑,所以会晚一个周期,所以在寄存器堆的相关代码中加入检测,如果某时刻要写入和读取的寄存器地址相同,则直接将寄存器堆的输入作为输出。
//regfile module
if((raddr1 == waddr)&& (we == `WriteEnable)
&&(re1 == `ReadEnable)) begin //to avoid RAW instruction after 2 instructions
rdata1 = wdata;
end
而对于相隔一条指令和相邻指令的数据冲突问题,加入了ex和mem模块向id模块的数据前递。如果id模块是需要读取,ex或mem模块需要写回,同时二者所指向的寄存器地址相同时,则将要写入的数据直接作为id模块的reg信号输出。
//id module
if((reg2_read_o == `True_v)&&(ex_wreg_i ==`True_v)&&(ex_wd_i == reg2_addr_o)) begin
reg2_o = ex_wdata_i;
end else if((reg2_read_o == `True_v)&&(mem_wreg_i ==`True_v)&&(mem_wd_i == reg2_addr_o)) begin
reg2_o = mem_wdata_i;
end
但是对于紧跟着LOAD指令的指令,LOAD所要写入寄存器的数据必须在mem模块对RAM读取时才能产生。而紧跟LOAD指令的指令在被id模块处理时LOAD指令仍处于ex模块,必须进行流水线暂停。
//id module
assign pre_inst_is_load = (ex_alusel_i == `EXE_RES_LOAD)?`True_v:`False_v;
...
if((pre_inst_is_load == `True_v)&&(reg1_read_o == `True_v)&&(ex_wd_i == reg1_addr_o)) begin
stallreq_for_reg1_loadrelate = `True_v;
end
if((pre_inst_is_load == `True_v)&&(reg2_read_o == `True_v)&&(ex_wd_i == reg2_addr_o)) begin
stallreq_for_reg2_loadrelate = `True_v;
end
assign stallreq = stallreq_for_reg2_loadrelate | stallreq_for_reg1_loadrelate;
pre_inst_is_load用于检测ex模块当前执行的指令是否为LOAD指令,同时如果当前id模块的指令需要读取寄存器且和LOAD指令要写入的寄存器地址相同,则说明发生了LOAD冲突,需要暂停流水线,stallreq 为 true。
此时将暂停PC计数,同时使if_id模块输出保持不变,id_ex模块输出空指令。在维持PC 和ID模块不动的情况下等待LOAD指令到达mem模块。
// pc_reg module
if(stallreq == `Stop) begin
pc <= pc;
end
//if_id module
if(stallreq == `Stop) begin
id_pc <= id_pc;
id_inst <= id_inst;
end
//id_ex module
if(stallreq == `Stop) begin
ex_aluop <= `EXE_NOP_OP;
ex_alusel <= `EXE_RES_NOP;
ex_reg1 <= `ZeroWord;
ex_reg2 <= `ZeroWord;
ex_wd <= `NOPRegAddr;
ex_wreg <= `WriteDisable;
ex_link_address <= `ZeroWord;
ex_inst <= `ZeroWord;
end
如果将B,J跳转指令集也放到ex模块处理,此时PC已经经过了两个周期时间,则会造成时间上的浪费。我们将对跳转指令的处理前移至id模块,这样只需要处理PC模块多读取的一条指令。我们加入了stop模块来记录上一条是否发生了跳转。
`include "define.v"
// Remembers for one cycle that a branch/jump was taken.
//
//   branch_flag_i  branch-taken indication
//   stallreq       pipeline stall request; a branch seen while the
//                  pipeline is stalled is not recorded
//   stop_flag_o    registered flag; it is cleared on the cycle after it
//                  was set, so it is never high two cycles in a row
//
// Fix: the reset test and the flag test used to be two independent `if`
// statements. During reset the second statement still executed and its
// non-blocking assignment, being the later one, overrode the reset value.
// The branches are now a single if / else-if / else chain so reset has
// priority.
module stop(
    input  wire clk,
    input  wire rst,
    input  wire branch_flag_i,
    input  wire stallreq,
    output reg  stop_flag_o
);
    always @(posedge clk) begin
        if (rst == `RstEnable) begin
            stop_flag_o <= 1'b0;
        end else if (stop_flag_o == `Stop) begin
            // Flag was set last cycle: clear it again.
            stop_flag_o <= 1'b0;
        end else begin
            // Record a taken branch unless the pipeline is stalled.
            stop_flag_o <= branch_flag_i & (~stallreq);
        end
    end
endmodule
而对于id_ex模块,如果之前发生了跳转指令,就输出空指令,相当于无视了id模块对下一条指令的处理,实现了流水线清理。
//id_ex module
if(if_stop_i == 1'b1) begin
ex_aluop <= `EXE_NOP_OP;
ex_alusel <= `EXE_RES_NOP;
ex_reg1 <= `ZeroWord;
ex_reg2 <= `ZeroWord;
ex_wd <= `NOPRegAddr;
ex_wreg <= `WriteDisable;
ex_link_address <= `ZeroWord;
ex_inst <= `ZeroWord;
end
上面的操作实现了跳转指令后的一条指令不会进入ex模块被处理,但是跳转指令的跳转信号是在id模块内被处理的,因此如果发生跳转的跳转指令的后一条指令也是跳转指令,则后者不应该再发生跳转。
//id moudule
if(stop_flag_i == `Stop) begin
branch_flag_o = `NotBranch;
branch_target_address_o =`ZeroWord;
end
如果LOAD指令后紧跟一条跳转指令,同时在LOAD信号存入寄存器前后都满足跳转条件,则被LOAD信号暂停了一个周期的跳转指令则会影响自己,导致跳转不发生。修改stop模块来处理该情况。
//stop module
stop_flag_o <= branch_flag_i & (~stallreq);
我在以下部分讲述一下我们对于CPU MODULE 的各条指令的仿真验证,并给出代表性的波形图。
模块声明与CPU接口: 这段代码中定义的名为tb_CPU
的模块是用来测试CPU。它包含与CPU模块相关的输入和输出接口。例如,clk
(时钟信号)、rst
(复位信号)、rom_data_i
(从ROM读取的数据输入)、ram_data_i
(从RAM读取的数据输入)等作为输入;而rom_ce_o
(ROM的片选输出)、rom_addr_o
(ROM的地址输出)、ram_ce_o
(RAM的片选输出)等作为输出。
注:其中#6的延迟是为了在rom_ce_o有效以后的第一个时钟有效沿过1ns的时刻给出rom_data_i的指令。
#6 //posedge trigger
`include "../../sources_1/new/define.v"
`timescale 1ns / 1ps
// Stand-alone testbench for the CPU module.
//
// There is no ROM or RAM model: the test drives rom_data_i (the fetched
// instruction) and ram_data_i (the load data) by hand, one instruction per
// 10 ns clock period, and the results are inspected in the waveform viewer.
//
// Every stimulus sequence below is kept inside a comment. To run one of
// them, un-comment exactly that sequence; as written the bench only applies
// reset and then finishes.
module tb_CPU;
// CPU Inputs
reg clk = 0 ;
reg rst = 0 ;
reg [`InstBus] rom_data_i = 0 ;
reg [`DataBus] ram_data_i = 0 ;
// CPU Outputs
wire rom_ce_o ;
wire [`InstAddrBus] rom_addr_o ;
wire ram_ce_o ;
wire [`ByteSelBus] ram_we_o ;
wire [`DataAddrBus] ram_addr_o ;
wire [`DataBus] ram_data_o ;
// 10 ns period clock; rising edges at 5, 15, 25, ... ns.
always #5 clk=~clk;
// Device under test.
CPU u_CPU (
.clk ( clk ),
.rst ( rst ),
.rom_data_i ( rom_data_i ),
.ram_data_i ( ram_data_i ),
.rom_ce_o ( rom_ce_o ),
.rom_addr_o ( rom_addr_o ),
.ram_ce_o ( ram_ce_o ),
.ram_we_o ( ram_we_o ),
.ram_addr_o ( ram_addr_o ),
.ram_data_o ( ram_data_o )
);
initial
begin
// Reset is asserted from 20 ns to 40 ns.
#20 rst = 1;
#20 rst = 0;
// Wait until 46 ns, i.e. 1 ns after the rising edge at 45 ns, so that
// every instruction below changes just after a clock edge. While all
// sequences are commented out this delay applies to the final $finish.
#6 //posedge trigger
//----------testbench for L instruction----------
/*
//-----------testbench for load byte instruction----------
ram_data_i = 32'b00000001_00000010_00000100_00001000;
rom_data_i = 32'b1111111_11111_00000_000_00001_0010011;//addi x1,x0,-1
#10
rom_data_i = 32'b0000000_00010_00001_000_00011_0000011;//lb x3,2(x1)  (immediate field is 2)
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
//LB instruction succeed
*/
/*
//-----------testbench for LH instruction----------
ram_data_i = 32'b00000001_00000010_00000100_00001000;
rom_data_i = 32'b00000000000000000000000010010011;//addi x1,x0,0
#10
rom_data_i = 32'b00000000010000001001000110000011;//lh x3,4(x1)
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
//LH instruction succeed
*/
//-----------testbench for LW instruction---------- jyk have detected before
//LW instruction succeed
/*
//----------testbench for S instruction
////-----------testbench for SB instruction----------
rom_data_i = 32'b11111111111100000000000010010011;//addi x1,x0,-1
#10
rom_data_i = 32'b 00000000000100000000001000100011;//sb x1,4(x0)
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
//S instruction succeed
*/
//----------testbench for I instruction
/*----------addi and or succeed (second instruction is R-type `or`)-----------
rom_data_i = 32'b0000000_00001_00010_000_00011_0010011; //addi x3, x2, 1
#10
rom_data_i = 32'b0000000_00001_00011_110_00100_0110011; //or x4, x3, x1
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#70
*/
//xori should be successful
/*----------slti succeed ----------
rom_data_i = 32'b11111111111100000000000010010011;//addi x1,x0,-1
#10
rom_data_i = 32'b00000000001000001010000100010011;//slti x2,x1,2
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
*/
/*----------sltiu succeed----------
rom_data_i = 32'b11111111111100000000000010010011;//addi x1,x0,-1
#10
rom_data_i = 32'b00000000001000001011000100010011;//sltiu x2,x1,2
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
*/
/*----------slli succeed----------
rom_data_i = 32'b11111111111100000000000010010011;//addi x1,x0,-1
#10
rom_data_i = 32'b00000000000100001001000100010011;//slli x2,x1,1
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
*/
//srli should be successful
/*----------srai succeed----------
rom_data_i = 32'b11111111111100000000000010010011;//addi x1,x0,-1
#10
rom_data_i = 32'b01000000000100001101000100010011;//srai x2,x1,1
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
*/
//testbench for R instruction
/*
//add instruction and sub instruction succeed
rom_data_i = 32'b11111111111100000000000010010011;//addi x1,x0,-1
#10
rom_data_i = 32'b00000000001100001000000100010011;//addi x2,x1,3
#10
rom_data_i = 32'b0000000_00010_00001_000_00011_0110011;//add x3,x1,x2
#10
rom_data_i = 32'b0100000_00001_00010_000_00100_0110011;//sub x4,x2,x1
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
//the rest of R instruction have not been tested.
*/
/*
//testbench for LUI instruction ---succeed
rom_data_i = 32'b00000000000000000001000010110111;//lui x1,1
#10
rom_data_i = 32'b00000000000011111111000100110111;//lui x2,255
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
*/
/*
//-----------testbench for AUIPC instruction
// it's correct
rom_data_i = 32'b00000000000000000000000010010111;//auipc x1,0
#10
rom_data_i = 32'b00000000000000000000000100010111;//auipc x2,0
#10
rom_data_i = 32'b00000000000000000000000110010111;//auipc x3,0
#10
rom_data_i = 32'b00000000000000000100001000010111;//auipc x4,4
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
*/
//----------testbench for J instruction
/*
// JAL instruction succeed
rom_data_i = 32'b00000000000000000000000100010111;//auipc x2,0
#10
rom_data_i = 32'b00000000000000000000000110010111;//auipc x3,0
#10
rom_data_i = 32'b00000000000000000100001000010111;//auipc x4,4
#10
rom_data_i = 32'b11111111010111111111000011101111;//jal x1,-12
#10
rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
#50
*/
// JALR instruction
//rom_data_i = 32'b00000000100000000000000010010011;//addi x1,x0,8
//#10
/*
rom_data_i = 32'b00000000000000000000000100010111;//auipc x2,0
#10
rom_data_i = 32'b00000000000000000000000110010111;//auipc x3,0
#10
rom_data_i = 32'b00000000000000000100001000010111;//auipc x4,4
#10
rom_data_i = 32'b00000000010000001000001011100111;//jalr x5,x1,4
#10
*/
//rom_data_i = 32'b0000000_00000_00000_000_00000_0000000; //nop
//#50
$finish;
end
endmodule
可以发现rom_addr_o的输出从32'h00000014转变成了32'h0000000c,可见JAL指令实现了无条件跳转;此外可以发现regfile[5]变成了32'h00000014,可知JAL指令也完成了写入操作。综上所述,可知JAL指令实现成功。
对顶层模块的仿真可以测试CPU与外围电路的交互。一段示例tb代码如下:
`timescale 1ns / 1ps
// System-level testbench: drives the board inputs of top_module and
// presses three buttons in sequence so that the CPU's handling of button
// input can be checked in the waveform.
module tb_top_module;

    // Stimulus applied to top_module
    reg         clk = 0;
    reg         rst = 0;
    reg  [4:0]  btn = 5'b0;
    reg  [14:0] sw  = 15'b0;

    // Responses observed from top_module
    wire [15:0] led;
    wire [11:0] tube;
    wire [13:0] vga;

    // 10 ns period board clock
    always #5 clk = ~clk;

    top_module u_top_module (
        .clk  (clk),
        .rst  (rst),
        .btn  (btn),
        .sw   (sw),
        .led  (led),
        .tube (tube),
        .vga  (vga)
    );

    // Hold the given button pattern for 10000 ns, then release everything.
    task press_button;
        input [4:0] pattern;
        begin
            btn = pattern;
            #10000 btn = 5'b00000;
        end
    endtask

    initial begin
        rst = 1;
        #3000  rst = 0;                  // CLK_GEN needs about 2700 ns to get ready
        #7000  press_button(5'b00100);   // down button
        #5000  press_button(5'b10000);   // centre button
        #5000  press_button(5'b01000);   // up button
        #30000
        $finish;
    end

endmodule
该示例tb代码旨在测试按钮输入能否被CPU正确读取,仿真结果如下:
可以看到,当按下下按钮(btn从5'h0变为5'h4)时,btn_data会一直保持5'h4,直至该按钮被CPU读取(光标处),即读取内存地址为32'h00040004的数据,btn_ena的该位被置1,表明按钮信号已被读取,后一个周期btn_data就回到了5'h0。这段仿真结果显示了按钮功能的正确性,既保证按钮按下能被CPU读取,又不会被识别为按下了多次。具体实现可参考工程内的btn_proc.v。
此外,在对顶层模块的仿真中还能观察到一些时序上的特性,下面详细介绍。
首先,考虑到BRAM的输出并非是组合逻辑的,而是会在下一个时钟上升沿到来时输出,为保证取指和访存阶段分别能够在当阶段拿到来自ROM和RAM的数据,我们设置ROM和RAM的时钟频率两倍于CPU频率。因而在上面的仿真结果中,可以看到cpu_rom_data和真正来自RAM的cpu_ram_data都是比地址请求延后了半个时钟周期。
基于此,经测试可以得到不违反时序要求的最高CPU频率是37MHz,以下是时序报告中的关键路径:
在验收时,助教点明了时序优化的两个努力方向:
一方面,可以向EDA工具说明存储器时钟与CPU时钟的关系,以降低时序报告中的要求;另一方面,可以用数据前馈以外的方法解决数据冲突,以缩短关键路径。
#include <stdio.h>
/*
 * Bare-metal demo program for the board.
 *
 * A menu (shown over VGA) selects one of two modes with the up/down
 * buttons; the centre button confirms:
 *   - prime mode: enter a number of up to four decimal digits on the
 *     digital tube and display how many primes are <= that number;
 *   - LED mode:   a running light that only visits the LEDs whose switch
 *     is on.
 *
 * Changes relative to the previous revision:
 *   - every device pointer is `volatile`, so each `*ptr` in the source is
 *     exactly one bus access and polling loops are not optimised away;
 *   - button tests use `& 1` instead of `<< 31` (shifting a 1 into the
 *     sign bit of an int is undefined behaviour);
 *   - the switch register is read once per use instead of twice inside
 *     one expression;
 *   - the delay-loop counter is `volatile` so the loop survives
 *     optimisation.
 *
 * The button test is a macro, not a function, on purpose: main() must stay
 * the first (and only) function in the object file, because the build flow
 * inserts `lui sp,4` in front of the first instruction.
 */

/* Non-zero when bit `bit` of the word read from button register `reg` is set.
 * Performs exactly one read of *reg. */
#define BTN_PRESSED(reg, bit) (((*(reg)) >> (bit)) & 1)

int main()
{
    volatile int *led_data  = (volatile int *)0x00080000; /* address assigned to the LEDs */
    volatile int *tube_data = (volatile int *)0x00090000; /* address assigned to the digital tube */
    volatile int *vga_data  = (volatile int *)0x000A0000; /* address assigned to the VGA menu */
    volatile int *sw_data   = (volatile int *)0x00050000; /* address assigned to the switches */
    /* One address per button; the low five address bits select the button. */
    volatile int *btn_data5 = (volatile int *)0x00040010; /* centre */
    volatile int *btn_data4 = (volatile int *)0x00040008; /* up */
    volatile int *btn_data3 = (volatile int *)0x00040004; /* down */
    volatile int *btn_data2 = (volatile int *)0x00040002; /* left */
    volatile int *btn_data1 = (volatile int *)0x00040001; /* right */

    int sum; /* scratch variables */
    int flag;
    int temp;
    int vga = 0;
    int tube = 0;
    int led_temp1 = 0, led_temp2 = 0, led = 0;
    int num[4] = {0, 0, 0, 0};
    int max = 0;
    int flash;
    int sw;

    *led_data = 0;
    *tube_data = 0;
    *vga_data = 0;

    /* Menu: loop until the centre button is pressed. */
    while (!BTN_PRESSED(btn_data5, 4))
    {
        *tube_data = 0x0f008888;
        if (BTN_PRESSED(btn_data4, 3))
            vga = vga & 0xfffffffd; /* up: move the selection frame to the upper entry */
        else if (BTN_PRESSED(btn_data3, 2))
            vga = vga | 0x00000002; /* down: move the selection frame to the lower entry */
        *vga_data = vga;
    }
    *vga_data = vga | 1; /* centre button pressed: the frame stops blinking */

    if (vga / 2 % 2)
    {
        /* ---------------- prime-counting mode ---------------- */
        flash = 0;
        *tube_data = 0x0f100000; /* rightmost digit blinks, all digits are 0 */
        while (!BTN_PRESSED(btn_data5, 4))
        {
            if (BTN_PRESSED(btn_data4, 3)) /* up: blinking digit + 1, wrapping 9 -> 0 */
            {
                num[flash] = num[flash] + 1;
                if (num[flash] == 10)
                    num[flash] = 0;
            }
            else if (BTN_PRESSED(btn_data3, 2)) /* down: blinking digit - 1, wrapping 0 -> 9 */
            {
                if (num[flash] == 0)
                    num[flash] = 10;
                num[flash] = num[flash] - 1;
            }
            else if (BTN_PRESSED(btn_data2, 1)) /* left: blinking position moves one digit left */
            {
                flash = (flash + 1) % 4;
            }
            else if (BTN_PRESSED(btn_data1, 0)) /* right: blinking position moves one digit right */
            {
                flash = (flash + 3) % 4;
            }
            /* Pack the digits and the blink position into the tube word. */
            tube = 0x0f000000 + ((1 << (flash + 20))) + (num[3] << (12)) + (num[2] << (8)) + (num[1] << (4)) + num[0];
            *tube_data = tube;
        }
        *tube_data = tube & 0xff0fffff; /* stop blinking */

        /* max = 1000*num[3] + 100*num[2] + 10*num[1] + num[0], written as
         * shifts and adds (presumably because the CPU implements only the
         * base RV32I set, which has no multiply instruction). */
        max = (num[3] << 9) + (num[3] << 8) + (num[3] << 7) + (num[3] << 6) + (num[3] << 5) + (num[3] << 3)
            + (num[2] << 6) + (num[2] << 5) + (num[2] << 2)
            + (num[1] << 3) + (num[1] << 1)
            + num[0];

        sum = 0; /* number of primes found */
        for (int i = 2; i <= max; i++) /* every candidate up to and including max */
        {
            flag = 1; /* 1 means "i is prime" */
            for (int j = 2; j < i; j++) /* every possible divisor */
            {
                /* temp = i mod j by repeated subtraction */
                temp = i;
                while (temp >= j)
                {
                    temp = temp - j;
                }
                if (temp == 0)
                {
                    flag = 0;
                    break;
                }
            }
            if (flag == 1)
                sum = sum + 1;
        }

        /* Split the count into four decimal digits by repeated subtraction. */
        num[3] = 0;
        num[2] = 0;
        num[1] = 0;
        while (sum >= 1000)
        {
            num[3] = num[3] + 1;
            sum = sum - 1000;
        }
        while (sum >= 100)
        {
            num[2] = num[2] + 1;
            sum = sum - 100;
        }
        while (sum >= 10)
        {
            num[1] = num[1] + 1;
            sum = sum - 10;
        }
        num[0] = sum;

        /* Bit 24 is always set; bits 25..27 are set only from the most
         * significant non-zero digit downwards. */
        *tube_data = 0x01000000 + ((1 && num[3]) << 27) + ((num[3] || num[2]) << 26) + ((num[3] || num[2] || num[1]) << 25) + (num[3] << 12) + (num[2] << 8) + (num[1] << 4) + num[0];
        while (1)
        {
        } /* park here so execution never falls back to the start */
    }
    else
    {
        /* ---------------- running-light mode ---------------- */
        sw = *sw_data;
        led = sw & (~sw + 1); /* lowest switch that is on */
        *led_data = led;
        while (1)
        {
            for (volatile int i = 0; i < 1000000; i++)
                ; /* delay */
            /* Mask covering the lit LED and everything to its right. */
            led_temp1 = led ? (((led) << 1) - 1) : 0;
            /* Switches to the left of the lit LED. */
            led_temp2 = *sw_data & (~led_temp1);
            /* x & (-x) isolates the lowest set bit: next LED to light. */
            led = led_temp2 & (~led_temp2 + 1);
            *led_data = led;
        }
    }
    return 0;
}
分为以下几个步骤:
用riscv官方编译器进行编译,将.c文件转换为.s文件;
riscv64-unknown-elf-gcc -march=rv32i -mabi=ilp32 -S example.c -o example.s
修改.s文件,在第一条指令前加入一条lui sp,4
指令,让栈指针指向RAM的存储空间的中心位置;
用riscv官方编译器进行编译汇编,将.s文件转换为.o文件;
riscv64-unknown-elf-gcc -march=rv32i -mabi=ilp32 -c example.s -o example.o
用Python的bin2coe库,将.o文件转换为.coe文件;
bin2coe -i example.o -w 32 -o example.coe
修改.coe文件,删去.o文件中开头非指令部分转换得到的部分,即指令00004137
前的;
修改ROM的ip核的设置,用该.coe文件初始化ROM。
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。