aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Clifford Wolf <clifford@clifford.at> 2015-06-07 08:28:10 +0200
committer Clifford Wolf <clifford@clifford.at> 2015-06-07 11:49:47 +0200
commit e84f044bc5e740c880ae547e84c3f3a0fe424f51 (patch)
tree 1b5c3b0519d93eb23a3104e7519880dd4929bf3c
parent 491cd5e15dc3f96d177493d2e23edc356dcc648e (diff)
download picorv32-e84f044bc5e740c880ae547e84c3f3a0fe424f51.tar.gz
picorv32-e84f044bc5e740c880ae547e84c3f3a0fe424f51.zip
Major redesign of main FSM
-rw-r--r-- .gitignore 32
-rw-r--r-- README.md 40
-rw-r--r-- dhrystone/Makefile 10
-rw-r--r-- dhrystone/start.S 10
-rw-r--r-- dhrystone/testbench.v 9
-rw-r--r-- firmware/start.S 1
-rw-r--r-- picorv32.v 435
-rw-r--r-- tests/fence_i.S 53
8 files changed, 287 insertions, 303 deletions
diff --git a/.gitignore b/.gitignore
index c3a91c8..528f8cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,14 +1,18 @@
-tests/*.o
-firmware/firmware.bin
-firmware/firmware.elf
-firmware/firmware.hex
-firmware/firmware.map
-testbench.exe
-testbench_axi.exe
-testbench.vcd
-dhrystone/dhry.bin
-dhrystone/dhry.elf
-dhrystone/dhry.hex
-dhrystone/dhry.map
-dhrystone/*.d
-dhrystone/*.o
+/tests/*.o
+/firmware/firmware.bin
+/firmware/firmware.elf
+/firmware/firmware.hex
+/firmware/firmware.map
+/dhrystone/dhry.bin
+/dhrystone/dhry.elf
+/dhrystone/dhry.hex
+/dhrystone/dhry.map
+/dhrystone/testbench.exe
+/dhrystone/testbench.vcd
+/dhrystone/timing.exe
+/dhrystone/timing.txt
+/dhrystone/*.d
+/dhrystone/*.o
+/testbench.exe
+/testbench_axi.exe
+/testbench.vcd
diff --git a/README.md b/README.md
index 0e9e4e7..a215843 100644
--- a/README.md
+++ b/README.md
@@ -46,26 +46,26 @@ interface, and communicating with the outside world via AXI4.
Performance:
------------
-The average Cycles per Instruction (CPI) is 5 to 7, depending on the
-mix of instructions in the code. The CPI for the individual instructions is:
-
-| Instruction | CPI |
-| ---------------------| ---:|
-| direct jump (jal) | 4 |
-| ALU reg + immediate | 4 |
-| ALU reg + reg | 5 |
-| branch (not taken) | 5 |
-| memory load | 7 |
-| memory store | 8 |
-| branch (taken) | 8 |
-| indirect jump (jalr) | 8 |
-| shift operations | 5+ |
-
-Dhrystone benchmark results: 0.215 DMIPS/MHz (379 Dhrystones/Second/MHz)
-
-For the Dryhstone benchmark the average CPI is 5.983.
-
-*This numbers apply for setups with memory that can accomodate requests within
+The average Cycles per Instruction (CPI) is 4 to 6, depending on the mix of
+instructions in the code. The CPI numbers for the individual instructions are:
+
+| Instruction | CPI |
+| ---------------------| ----:|
+| direct jump (jal) | 3 |
+| ALU reg + immediate | 3 |
+| ALU reg + reg | 4 |
+| branch (not taken) | 4 |
+| memory load | 5 |
+| memory store | 6 |
+| branch (taken) | 6 |
+| indirect jump (jalr) | 6 |
+| shift operations | 4-15 |
+
+Dhrystone benchmark results: 0.280 DMIPS/MHz (493 Dhrystones/Second/MHz)
+
+For the Dhrystone benchmark the average CPI is 4.606.
+
+*These numbers apply to systems with memory that can accommodate requests within
one clock cycle. Slower memory will degrade the performance of the processor.*
diff --git a/dhrystone/Makefile b/dhrystone/Makefile
index 412ffda..7e077b1 100644
--- a/dhrystone/Makefile
+++ b/dhrystone/Makefile
@@ -5,10 +5,18 @@ CFLAGS = -MD -O3 -m32 -march=RV32I -ffreestanding -nostdlib -DTIME -DRISCV
test: testbench.exe dhry.hex
vvp -N testbench.exe
+timing: timing.exe dhry.hex
+ vvp -N timing.exe > timing.txt
+ sed 's,.*## ,,' timing.txt | gawk 'x != "" {print x,$$2-y;} {x=$$1;y=$$2;}' | sort | uniq -c | sort -k3 -n
+
testbench.exe: testbench.v ../picorv32.v
iverilog -o testbench.exe testbench.v ../picorv32.v
chmod -x testbench.exe
+timing.exe: testbench.v ../picorv32.v
+ iverilog -o timing.exe -DTIMING testbench.v ../picorv32.v
+ chmod -x timing.exe
+
dhry.hex: dhry.bin ../firmware/makehex.py
python3 ../firmware/makehex.py $< > $@
@@ -27,7 +35,7 @@ dhry.elf: $(OBJS) ../firmware/sections.lds
riscv64-unknown-elf-gcc -c $(CFLAGS) $<
clean:
- rm -rf *.o *.d dhry.elf dhry.map dhry.bin dhry.hex testbench.exe testbench.vcd
+ rm -rf *.o *.d dhry.elf dhry.map dhry.bin dhry.hex testbench.exe testbench.vcd timing.exe timing.txt
.PHONY: test clean
diff --git a/dhrystone/start.S b/dhrystone/start.S
index 202727c..092ba96 100644
--- a/dhrystone/start.S
+++ b/dhrystone/start.S
@@ -17,6 +17,16 @@ start:
sw a2,0(a0)
sw a5,0(a0)
+ /* execute some insns for "make timing" */
+ lui a0,0
+ auipc a0,0
+ slli a0,a0,0
+ slli a0,a0,31
+ addi a1,zero,0
+ sll a0,a0,a1
+ addi a1,zero,31
+ sll a0,a0,a1
+
/* set stack pointer */
lui sp,(64*1024)>>12
diff --git a/dhrystone/testbench.v b/dhrystone/testbench.v
index c14779e..bfa9fbc 100644
--- a/dhrystone/testbench.v
+++ b/dhrystone/testbench.v
@@ -43,11 +43,12 @@ module testbench;
assign mem_ready = 1;
always @(posedge clk) begin
- mem_rdata <= mem_la_read ? memory[mem_la_addr >> 2] : 'bx;
+ if (mem_la_read)
+ mem_rdata <= memory[mem_la_addr >> 2];
if (mem_valid) begin
case (mem_addr)
32'h1000_0000: begin
-`ifndef INSN_TIMING
+`ifndef TIMING
$write("%c", mem_wdata);
$fflush();
`endif
@@ -75,14 +76,12 @@ module testbench;
end
end
-`ifdef INSN_TIMING
+`ifdef TIMING
initial begin
repeat (100000) @(posedge clk);
$finish;
end
always @(uut.count_instr[0]) begin
- // iverilog -DINSN_TIMING testbench.v ../picorv32.v && ./a.out > x
- // sed 's,.*## ,,' x | gawk 'x != "" {print x,$2-y;} {x=$1;y=$2;}' | sort | uniq -c | sort -k3 -n
$display("## %-s %d", uut.instruction, uut.count_cycle);
end
`endif
diff --git a/firmware/start.S b/firmware/start.S
index 34e44d8..d9e0b3e 100644
--- a/firmware/start.S
+++ b/firmware/start.S
@@ -50,7 +50,6 @@ start:
TEST(or)
TEST(and)
- TEST(fence_i)
TEST(simple)
/* set stack pointer */
diff --git a/picorv32.v b/picorv32.v
index b84ce63..f84897a 100644
--- a/picorv32.v
+++ b/picorv32.v
@@ -49,12 +49,11 @@ module picorv32 #(
localparam integer regindex_bits = ENABLE_REGS_16_31 ? 5 : 4;
reg [63:0] count_cycle, count_instr;
- reg [31:0] reg_pc, reg_op1, reg_op2, reg_out;
+ reg [31:0] reg_pc, reg_next_pc, reg_op1, reg_op2, reg_out, reg_alu_out;
reg [31:0] cpuregs [0:regfile_size-1];
reg [4:0] reg_sh;
- wire reg_out_0 = reg_out[0];
-
+ wire [31:0] next_pc;
// Memory Interface
@@ -65,24 +64,47 @@ module picorv32 #(
reg mem_do_rinst;
reg mem_do_rdata;
reg mem_do_wdata;
- reg mem_done;
wire mem_busy = |{mem_do_prefetch, mem_do_rinst, mem_do_rdata, mem_do_wdata};
+ wire mem_done = mem_ready && ((mem_state[0] && (mem_do_rinst || mem_do_rdata)) || mem_state == 2);
+
assign mem_la_read = resetn && !mem_state && (mem_do_rinst || mem_do_prefetch || mem_do_rdata);
- assign mem_la_addr = mem_do_prefetch ? reg_pc + 4 : mem_do_rinst ? reg_pc : {reg_op1[31:2], 2'b00};
+ assign mem_la_addr = mem_do_prefetch || mem_do_rinst ? next_pc : {reg_op1[31:2], 2'b00};
+
+ always @* begin
+ (* full_case *)
+ case (mem_wordsize)
+ 0: begin
+ mem_buffer = mem_rdata;
+ end
+ 1: begin
+ case (reg_op1[1])
+ 1'b0: mem_buffer = mem_rdata[15: 0];
+ 1'b1: mem_buffer = mem_rdata[31:16];
+ endcase
+ end
+ 2: begin
+ case (reg_op1[1:0])
+ 2'b00: mem_buffer = mem_rdata[ 7: 0];
+ 2'b01: mem_buffer = mem_rdata[15: 8];
+ 2'b10: mem_buffer = mem_rdata[23:16];
+ 2'b11: mem_buffer = mem_rdata[31:24];
+ endcase
+ end
+ endcase
+ end
always @(posedge clk) begin
- mem_done <= 0;
if (!resetn) begin
mem_state <= 0;
mem_valid <= 0;
end else case (mem_state)
0: begin
mem_addr <= mem_la_addr;
- if (mem_do_rinst || mem_do_prefetch || mem_do_rdata) begin
+ if (mem_do_prefetch || mem_do_rinst || mem_do_rdata) begin
mem_valid <= 1;
- mem_instr <= mem_do_rinst || mem_do_rdata;
+ mem_instr <= mem_do_prefetch || mem_do_rinst;
mem_wstrb <= 0;
mem_state <= 1;
end
@@ -110,43 +132,20 @@ module picorv32 #(
end
1: begin
if (mem_ready) begin
- (* full_case *)
- case (mem_wordsize)
- 0: begin
- mem_buffer <= mem_rdata;
- end
- 1: begin
- case (reg_op1[1])
- 1'b0: mem_buffer <= mem_rdata[15: 0];
- 1'b1: mem_buffer <= mem_rdata[31:16];
- endcase
- end
- 2: begin
- case (reg_op1[1:0])
- 2'b00: mem_buffer <= mem_rdata[ 7: 0];
- 2'b01: mem_buffer <= mem_rdata[15: 8];
- 2'b10: mem_buffer <= mem_rdata[23:16];
- 2'b11: mem_buffer <= mem_rdata[31:24];
- endcase
- end
- endcase
mem_valid <= 0;
- mem_state <= 3;
- mem_done <= mem_do_rinst || mem_do_rdata;
+ mem_state <= mem_do_rinst || mem_do_rdata ? 0 : 3;
end
end
2: begin
if (mem_ready) begin
mem_valid <= 0;
- mem_state <= 3;
- mem_done <= 1;
+ mem_state <= 0;
end
end
3: begin
- if (mem_done)
+ if (mem_do_rinst) begin
mem_state <= 0;
- else if (mem_do_rinst || mem_do_rdata)
- mem_done <= 1;
+ end
end
endcase
end
@@ -159,7 +158,7 @@ module picorv32 #(
reg instr_lb, instr_lh, instr_lw, instr_lbu, instr_lhu, instr_sb, instr_sh, instr_sw;
reg instr_addi, instr_slti, instr_sltiu, instr_xori, instr_ori, instr_andi, instr_slli, instr_srli, instr_srai;
reg instr_add, instr_sub, instr_sll, instr_slt, instr_sltu, instr_xor, instr_srl, instr_sra, instr_or, instr_and;
- reg instr_fence, instr_rdcycle, instr_rdcycleh, instr_rdinstr, instr_rdinstrh;
+ reg instr_rdcycle, instr_rdcycleh, instr_rdinstr, instr_rdinstrh;
wire instr_trap;
reg [regindex_bits-1:0] decoded_rd, decoded_rs1, decoded_rs2;
@@ -167,7 +166,7 @@ module picorv32 #(
reg decoder_trigger;
wire [31:0] decoded_imm_uj;
- assign { decoded_imm_uj[31:20], decoded_imm_uj[10:1], decoded_imm_uj[11], decoded_imm_uj[19:12], decoded_imm_uj[0] } = $signed({mem_buffer[31:12], 1'b0});
+ assign { decoded_imm_uj[31:20], decoded_imm_uj[10:1], decoded_imm_uj[11], decoded_imm_uj[19:12], decoded_imm_uj[0] } = $signed({mem_rdata[31:12], 1'b0});
reg is_lui_auipc_jal;
reg is_lb_lh_lw_lbu_lhu;
@@ -200,7 +199,7 @@ module picorv32 #(
instr_lb, instr_lh, instr_lw, instr_lbu, instr_lhu, instr_sb, instr_sh, instr_sw,
instr_addi, instr_slti, instr_sltiu, instr_xori, instr_ori, instr_andi, instr_slli, instr_srli, instr_srai,
instr_add, instr_sub, instr_sll, instr_slt, instr_sltu, instr_xor, instr_srl, instr_sra, instr_or, instr_and,
- instr_fence, instr_rdcycle, instr_rdcycleh, instr_rdinstr, instr_rdinstrh};
+ instr_rdcycle, instr_rdcycleh, instr_rdinstr, instr_rdinstrh};
reg [63:0] instruction;
@@ -248,7 +247,6 @@ module picorv32 #(
if (instr_or) instruction = "or";
if (instr_and) instruction = "and";
- if (instr_fence) instruction = "fence";
if (instr_rdcycle) instruction = "rdcycle";
if (instr_rdcycleh) instruction = "rdcycleh";
if (instr_rdinstr) instruction = "rdinstr";
@@ -259,64 +257,61 @@ module picorv32 #(
decoder_trigger <= 0;
if (mem_do_rinst && mem_done) begin
- instr_lui <= mem_buffer[6:0] == 7'b0110111;
- instr_auipc <= mem_buffer[6:0] == 7'b0010111;
-
- instr_jal <= mem_buffer[6:0] == 7'b1101111;
- instr_jalr <= mem_buffer[6:0] == 7'b1100111;
-
- instr_beq <= mem_buffer[6:0] == 7'b1100011 && mem_buffer[14:12] == 3'b000;
- instr_bne <= mem_buffer[6:0] == 7'b1100011 && mem_buffer[14:12] == 3'b001;
- instr_blt <= mem_buffer[6:0] == 7'b1100011 && mem_buffer[14:12] == 3'b100;
- instr_bge <= mem_buffer[6:0] == 7'b1100011 && mem_buffer[14:12] == 3'b101;
- instr_bltu <= mem_buffer[6:0] == 7'b1100011 && mem_buffer[14:12] == 3'b110;
- instr_bgeu <= mem_buffer[6:0] == 7'b1100011 && mem_buffer[14:12] == 3'b111;
-
- instr_lb <= mem_buffer[6:0] == 7'b0000011 && mem_buffer[14:12] == 3'b000;
- instr_lh <= mem_buffer[6:0] == 7'b0000011 && mem_buffer[14:12] == 3'b001;
- instr_lw <= mem_buffer[6:0] == 7'b0000011 && mem_buffer[14:12] == 3'b010;
- instr_lbu <= mem_buffer[6:0] == 7'b0000011 && mem_buffer[14:12] == 3'b100;
- instr_lhu <= mem_buffer[6:0] == 7'b0000011 && mem_buffer[14:12] == 3'b101;
-
- instr_sb <= mem_buffer[6:0] == 7'b0100011 && mem_buffer[14:12] == 3'b000;
- instr_sh <= mem_buffer[6:0] == 7'b0100011 && mem_buffer[14:12] == 3'b001;
- instr_sw <= mem_buffer[6:0] == 7'b0100011 && mem_buffer[14:12] == 3'b010;
-
- instr_addi <= mem_buffer[6:0] == 7'b0010011 && mem_buffer[14:12] == 3'b000;
- instr_slti <= mem_buffer[6:0] == 7'b0010011 && mem_buffer[14:12] == 3'b010;
- instr_sltiu <= mem_buffer[6:0] == 7'b0010011 && mem_buffer[14:12] == 3'b011;
- instr_xori <= mem_buffer[6:0] == 7'b0010011 && mem_buffer[14:12] == 3'b100;
- instr_ori <= mem_buffer[6:0] == 7'b0010011 && mem_buffer[14:12] == 3'b110;
- instr_andi <= mem_buffer[6:0] == 7'b0010011 && mem_buffer[14:12] == 3'b111;
-
- instr_slli <= mem_buffer[6:0] == 7'b0010011 && mem_buffer[14:12] == 3'b001 && mem_buffer[31:25] == 7'b0000000;
- instr_srli <= mem_buffer[6:0] == 7'b0010011 && mem_buffer[14:12] == 3'b101 && mem_buffer[31:25] == 7'b0000000;
- instr_srai <= mem_buffer[6:0] == 7'b0010011 && mem_buffer[14:12] == 3'b101 && mem_buffer[31:25] == 7'b0100000;
-
- instr_add <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b000 && mem_buffer[31:25] == 7'b0000000;
- instr_sub <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b000 && mem_buffer[31:25] == 7'b0100000;
- instr_sll <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b001 && mem_buffer[31:25] == 7'b0000000;
- instr_slt <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b010 && mem_buffer[31:25] == 7'b0000000;
- instr_sltu <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b011 && mem_buffer[31:25] == 7'b0000000;
- instr_xor <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b100 && mem_buffer[31:25] == 7'b0000000;
- instr_srl <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b101 && mem_buffer[31:25] == 7'b0000000;
- instr_sra <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b101 && mem_buffer[31:25] == 7'b0100000;
- instr_or <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b110 && mem_buffer[31:25] == 7'b0000000;
- instr_and <= mem_buffer[6:0] == 7'b0110011 && mem_buffer[14:12] == 3'b111 && mem_buffer[31:25] == 7'b0000000;
-
- instr_fence <= (mem_buffer[6:0] == 7'b0001111 && mem_buffer[19:12] == 0 && mem_buffer[31:28] == 4'b0000) ||
- (mem_buffer[6:0] == 7'b0001111 && mem_buffer[31:12] == 1);
-
- instr_rdcycle <= ((mem_buffer[6:0] == 7'b1110011 && mem_buffer[31:12] == 'b11000000000000000010) ||
- (mem_buffer[6:0] == 7'b1110011 && mem_buffer[31:12] == 'b11000000000100000010)) && ENABLE_COUNTERS;
- instr_rdcycleh <= ((mem_buffer[6:0] == 7'b1110011 && mem_buffer[31:12] == 'b11001000000000000010) ||
- (mem_buffer[6:0] == 7'b1110011 && mem_buffer[31:12] == 'b11001000000100000010)) && ENABLE_COUNTERS;
- instr_rdinstr <= (mem_buffer[6:0] == 7'b1110011 && mem_buffer[31:12] == 'b11000000001000000010) && ENABLE_COUNTERS;
- instr_rdinstrh <= (mem_buffer[6:0] == 7'b1110011 && mem_buffer[31:12] == 'b11001000001000000010) && ENABLE_COUNTERS;
-
- decoded_rd <= mem_buffer[11:7];
- decoded_rs1 <= mem_buffer[19:15];
- decoded_rs2 <= mem_buffer[24:20];
+ instr_lui <= mem_rdata[6:0] == 7'b0110111;
+ instr_auipc <= mem_rdata[6:0] == 7'b0010111;
+
+ instr_jal <= mem_rdata[6:0] == 7'b1101111;
+ instr_jalr <= mem_rdata[6:0] == 7'b1100111;
+
+ instr_beq <= mem_rdata[6:0] == 7'b1100011 && mem_rdata[14:12] == 3'b000;
+ instr_bne <= mem_rdata[6:0] == 7'b1100011 && mem_rdata[14:12] == 3'b001;
+ instr_blt <= mem_rdata[6:0] == 7'b1100011 && mem_rdata[14:12] == 3'b100;
+ instr_bge <= mem_rdata[6:0] == 7'b1100011 && mem_rdata[14:12] == 3'b101;
+ instr_bltu <= mem_rdata[6:0] == 7'b1100011 && mem_rdata[14:12] == 3'b110;
+ instr_bgeu <= mem_rdata[6:0] == 7'b1100011 && mem_rdata[14:12] == 3'b111;
+
+ instr_lb <= mem_rdata[6:0] == 7'b0000011 && mem_rdata[14:12] == 3'b000;
+ instr_lh <= mem_rdata[6:0] == 7'b0000011 && mem_rdata[14:12] == 3'b001;
+ instr_lw <= mem_rdata[6:0] == 7'b0000011 && mem_rdata[14:12] == 3'b010;
+ instr_lbu <= mem_rdata[6:0] == 7'b0000011 && mem_rdata[14:12] == 3'b100;
+ instr_lhu <= mem_rdata[6:0] == 7'b0000011 && mem_rdata[14:12] == 3'b101;
+
+ instr_sb <= mem_rdata[6:0] == 7'b0100011 && mem_rdata[14:12] == 3'b000;
+ instr_sh <= mem_rdata[6:0] == 7'b0100011 && mem_rdata[14:12] == 3'b001;
+ instr_sw <= mem_rdata[6:0] == 7'b0100011 && mem_rdata[14:12] == 3'b010;
+
+ instr_addi <= mem_rdata[6:0] == 7'b0010011 && mem_rdata[14:12] == 3'b000;
+ instr_slti <= mem_rdata[6:0] == 7'b0010011 && mem_rdata[14:12] == 3'b010;
+ instr_sltiu <= mem_rdata[6:0] == 7'b0010011 && mem_rdata[14:12] == 3'b011;
+ instr_xori <= mem_rdata[6:0] == 7'b0010011 && mem_rdata[14:12] == 3'b100;
+ instr_ori <= mem_rdata[6:0] == 7'b0010011 && mem_rdata[14:12] == 3'b110;
+ instr_andi <= mem_rdata[6:0] == 7'b0010011 && mem_rdata[14:12] == 3'b111;
+
+ instr_slli <= mem_rdata[6:0] == 7'b0010011 && mem_rdata[14:12] == 3'b001 && mem_rdata[31:25] == 7'b0000000;
+ instr_srli <= mem_rdata[6:0] == 7'b0010011 && mem_rdata[14:12] == 3'b101 && mem_rdata[31:25] == 7'b0000000;
+ instr_srai <= mem_rdata[6:0] == 7'b0010011 && mem_rdata[14:12] == 3'b101 && mem_rdata[31:25] == 7'b0100000;
+
+ instr_add <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b000 && mem_rdata[31:25] == 7'b0000000;
+ instr_sub <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b000 && mem_rdata[31:25] == 7'b0100000;
+ instr_sll <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b001 && mem_rdata[31:25] == 7'b0000000;
+ instr_slt <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b010 && mem_rdata[31:25] == 7'b0000000;
+ instr_sltu <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b011 && mem_rdata[31:25] == 7'b0000000;
+ instr_xor <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b100 && mem_rdata[31:25] == 7'b0000000;
+ instr_srl <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b101 && mem_rdata[31:25] == 7'b0000000;
+ instr_sra <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b101 && mem_rdata[31:25] == 7'b0100000;
+ instr_or <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b110 && mem_rdata[31:25] == 7'b0000000;
+ instr_and <= mem_rdata[6:0] == 7'b0110011 && mem_rdata[14:12] == 3'b111 && mem_rdata[31:25] == 7'b0000000;
+
+ instr_rdcycle <= ((mem_rdata[6:0] == 7'b1110011 && mem_rdata[31:12] == 'b11000000000000000010) ||
+ (mem_rdata[6:0] == 7'b1110011 && mem_rdata[31:12] == 'b11000000000100000010)) && ENABLE_COUNTERS;
+ instr_rdcycleh <= ((mem_rdata[6:0] == 7'b1110011 && mem_rdata[31:12] == 'b11001000000000000010) ||
+ (mem_rdata[6:0] == 7'b1110011 && mem_rdata[31:12] == 'b11001000000100000010)) && ENABLE_COUNTERS;
+ instr_rdinstr <= (mem_rdata[6:0] == 7'b1110011 && mem_rdata[31:12] == 'b11000000001000000010) && ENABLE_COUNTERS;
+ instr_rdinstrh <= (mem_rdata[6:0] == 7'b1110011 && mem_rdata[31:12] == 'b11001000001000000010) && ENABLE_COUNTERS;
+
+ decoded_rd <= mem_rdata[11:7];
+ decoded_rs1 <= mem_rdata[19:15];
+ decoded_rs2 <= mem_rdata[24:20];
decoder_trigger <= 1;
end
@@ -325,19 +320,19 @@ module picorv32 #(
(* parallel_case *)
case (1'b1)
|{instr_lui, instr_auipc}:
- decoded_imm <= mem_buffer[31:12] << 12;
+ decoded_imm <= mem_rdata[31:12] << 12;
instr_jal:
decoded_imm <= decoded_imm_uj;
instr_jalr:
- decoded_imm <= $signed(mem_buffer[31:20]);
+ decoded_imm <= $signed(mem_rdata[31:20]);
|{instr_beq, instr_bne, instr_blt, instr_bge, instr_bltu, instr_bgeu}:
- decoded_imm <= $signed({mem_buffer[31], mem_buffer[7], mem_buffer[30:25], mem_buffer[11:8], 1'b0});
+ decoded_imm <= $signed({mem_rdata[31], mem_rdata[7], mem_rdata[30:25], mem_rdata[11:8], 1'b0});
|{instr_lb, instr_lh, instr_lw, instr_lbu, instr_lhu}:
- decoded_imm <= $signed(mem_buffer[31:20]);
+ decoded_imm <= $signed(mem_rdata[31:20]);
|{instr_sb, instr_sh, instr_sw}:
- decoded_imm <= $signed({mem_buffer[31:25], mem_buffer[11:7]});
+ decoded_imm <= $signed({mem_rdata[31:25], mem_rdata[11:7]});
|{instr_addi, instr_slti, instr_sltiu, instr_xori, instr_ori, instr_andi}:
- decoded_imm <= $signed(mem_buffer[31:20]);
+ decoded_imm <= $signed(mem_rdata[31:20]);
default:
decoded_imm <= 1'bx;
endcase
@@ -347,46 +342,97 @@ module picorv32 #(
// Main State Machine
- localparam cpu_state_fetch = 0;
- localparam cpu_state_ld_rs1 = 1;
- localparam cpu_state_ld_rs2 = 2;
- localparam cpu_state_exec = 3;
- localparam cpu_state_shift = 4;
- localparam cpu_state_store = 5;
+ localparam cpu_state_trap = 0;
+ localparam cpu_state_fetch = 1;
+ localparam cpu_state_ld_rs1 = 2;
+ localparam cpu_state_ld_rs2 = 3;
+ localparam cpu_state_exec = 4;
+ localparam cpu_state_shift = 5;
localparam cpu_state_stmem = 6;
localparam cpu_state_ldmem = 7;
reg [2:0] cpu_state;
- reg force_mem_do_rinst;
- reg force_mem_do_rdata;
- reg force_mem_do_wdata;
+ reg set_mem_do_rinst;
+ reg set_mem_do_rdata;
+ reg set_mem_do_wdata;
reg mask_decoder_trigger;
reg force_decoder_trigger;
+ reg latched_store;
+ reg latched_stalu;
+ reg latched_branch;
reg latched_is_lu;
reg latched_is_lh;
reg latched_is_lb;
reg [regindex_bits-1:0] latched_rd;
+ reg [31:0] current_pc;
+ assign next_pc = latched_store && latched_branch ? reg_out : reg_next_pc;
+
+ reg [31:0] alu_out;
+ reg alu_out_0;
+
+ always @* begin
+ alu_out_0 = 'bx;
+ (* parallel_case, full_case *)
+ case (1'b1)
+ instr_beq:
+ alu_out_0 = reg_op1 == reg_op2;
+ instr_bne:
+ alu_out_0 = reg_op1 != reg_op2;
+ instr_bge:
+ alu_out_0 = $signed(reg_op1) >= $signed(reg_op2);
+ instr_bgeu:
+ alu_out_0 = reg_op1 >= reg_op2;
+ is_slti_blt_slt:
+ alu_out_0 = $signed(reg_op1) < $signed(reg_op2);
+ is_sltiu_bltu_sltu:
+ alu_out_0 = reg_op1 < reg_op2;
+ endcase
+
+ alu_out = 'bx;
+ (* parallel_case, full_case *)
+ case (1'b1)
+ is_lui_auipc_jal_jalr_addi_add:
+ alu_out = reg_op1 + reg_op2;
+ instr_sub:
+ alu_out = reg_op1 - reg_op2;
+ |{instr_beq, instr_bne, instr_bge, instr_bgeu, is_slti_blt_slt, is_sltiu_bltu_sltu}:
+ alu_out = alu_out_0;
+ instr_xori || instr_xor:
+ alu_out = reg_op1 ^ reg_op2;
+ instr_ori || instr_or:
+ alu_out = reg_op1 | reg_op2;
+ instr_andi || instr_and:
+ alu_out = reg_op1 & reg_op2;
+ endcase
+ end
+
always @(posedge clk) begin
+ trap <= 0;
reg_sh <= 'bx;
reg_out <= 'bx;
- force_mem_do_rinst = 0;
- force_mem_do_rdata = 0;
- force_mem_do_wdata = 0;
+ set_mem_do_rinst = 0;
+ set_mem_do_rdata = 0;
+ set_mem_do_wdata = 0;
mask_decoder_trigger <= 0;
force_decoder_trigger <= 0;
+ reg_alu_out <= alu_out;
+
if (ENABLE_COUNTERS)
count_cycle <= resetn ? count_cycle + 1 : 0;
if (!resetn) begin
- trap <= 0;
reg_pc <= 0;
+ reg_next_pc <= 0;
reg_op1 <= 'bx;
reg_op2 <= 'bx;
if (ENABLE_COUNTERS)
count_instr <= 0;
+ latched_store <= 0;
+ latched_stalu <= 0;
+ latched_branch <= 0;
latched_is_lu <= 0;
latched_is_lh <= 0;
latched_is_lb <= 0;
@@ -394,41 +440,58 @@ module picorv32 #(
end else
(* parallel_case, full_case *)
case (cpu_state)
+ cpu_state_trap: begin
+ trap <= 1;
+ end
cpu_state_fetch: begin
- mem_do_rinst <= (!decoder_trigger || mask_decoder_trigger) && !trap && !force_decoder_trigger;
- mem_do_prefetch <= 0;
+ mem_do_rinst <= (!decoder_trigger || mask_decoder_trigger) && !force_decoder_trigger;
mem_wordsize <= 0;
- if (latched_is_lu || latched_is_lh || latched_is_lb) begin
+ current_pc = reg_next_pc;
+
+ if (latched_branch) begin
+ current_pc = latched_store ? (latched_stalu ? reg_alu_out : reg_out) : reg_next_pc;
+`ifdef DEBUG
+ $display("ST_RD: %2d 0x%08x, BRANCH 0x%08x", latched_rd, reg_pc + 4, current_pc);
+`endif
+ cpuregs[latched_rd] <= reg_pc + 4;
+ end else
+ if (latched_store) begin
`ifdef DEBUG
$display("ST_RD: %2d 0x%08x", latched_rd, reg_out);
`endif
- cpuregs[latched_rd] <= reg_out;
+ cpuregs[latched_rd] <= latched_stalu ? reg_alu_out : reg_out;
end
+ reg_pc <= current_pc;
+ reg_next_pc <= current_pc;
+
+ latched_store <= 0;
+ latched_stalu <= 0;
+ latched_branch <= 0;
latched_is_lu <= 0;
latched_is_lh <= 0;
latched_is_lb <= 0;
+ latched_rd <= decoded_rd;
if ((decoder_trigger && !mask_decoder_trigger) || force_decoder_trigger) begin
`ifdef DEBUG
- $display("DECODE: 0x%08x %-s", reg_pc, instruction);
+ $display("DECODE: 0x%08x %-s", current_pc, instruction);
`endif
+ reg_next_pc <= current_pc + 4;
+
if (instr_trap) begin
- trap <= 1;
- end else if (instr_fence) begin
- mem_do_rinst <= 1;
- reg_pc <= reg_pc + 4;
- cpu_state <= cpu_state_fetch;
+`ifdef DEBUG
+ $display("SBREAK OR UNSUPPORTED INSN AT 0x%08x", current_pc);
+`endif
+ cpu_state <= cpu_state_trap;
end else if (instr_jal) begin
mem_do_rinst <= 1;
- reg_out <= reg_pc + 4;
if (latched_is_lu || latched_is_lh || latched_is_lb)
- reg_pc <= reg_pc + decoded_imm;
+ reg_next_pc <= current_pc + decoded_imm;
else
- reg_pc <= reg_pc + decoded_imm_uj;
- latched_rd <= decoded_rd;
- cpu_state <= cpu_state_store;
+ reg_next_pc <= current_pc + decoded_imm_uj;
+ latched_branch <= 1;
end else if (|{instr_rdcycle, instr_rdcycleh, instr_rdinstr, instr_rdinstrh}) begin
(* parallel_case, full_case *)
case (1'b1)
@@ -441,9 +504,9 @@ module picorv32 #(
instr_rdinstrh:
reg_out <= count_instr[63:32];
endcase
- latched_rd <= decoded_rd;
- cpu_state <= cpu_state_store;
+ latched_store <= 1;
end else begin
+ mem_do_rinst <= 0;
mem_do_prefetch <= !instr_jalr;
cpu_state <= cpu_state_ld_rs1;
end
@@ -493,40 +556,30 @@ module picorv32 #(
end
end
cpu_state_exec: begin
- (* parallel_case, full_case *)
- case (1'b1)
- is_lui_auipc_jal_jalr_addi_add:
- reg_out <= reg_op1 + reg_op2;
- instr_sub:
- reg_out <= reg_op1 - reg_op2;
- instr_beq:
- reg_out <= {31'bx, reg_op1 == reg_op2};
- instr_bne:
- reg_out <= {31'bx, reg_op1 != reg_op2};
- instr_bge:
- reg_out <= {31'bx, $signed(reg_op1) >= $signed(reg_op2)};
- instr_bgeu:
- reg_out <= {31'bx, reg_op1 >= reg_op2};
- is_slti_blt_slt:
- reg_out <= $signed(reg_op1) < $signed(reg_op2);
- is_sltiu_bltu_sltu:
- reg_out <= reg_op1 < reg_op2;
- instr_xori || instr_xor:
- reg_out <= reg_op1 ^ reg_op2;
- instr_ori || instr_or:
- reg_out <= reg_op1 | reg_op2;
- instr_andi || instr_and:
- reg_out <= reg_op1 & reg_op2;
- endcase
- latched_rd <= decoded_rd;
- cpu_state <= cpu_state_store;
+ reg_out <= reg_pc + decoded_imm;
+ if (is_beq_bne_blt_bge_bltu_bgeu) begin
+ latched_rd <= 0;
+ if (mem_done)
+ cpu_state <= cpu_state_fetch;
+ if (alu_out_0) begin
+ latched_store <= 1;
+ latched_branch <= 1;
+ mask_decoder_trigger <= 1;
+ set_mem_do_rinst = 1;
+ end
+ end else begin
+ latched_branch <= instr_jalr;
+ latched_store <= 1;
+ latched_stalu <= 1;
+ cpu_state <= cpu_state_fetch;
+ end
end
cpu_state_shift: begin
if (reg_sh == 0) begin
reg_out <= reg_op1;
mem_do_rinst <= mem_do_prefetch;
- latched_rd <= decoded_rd;
- cpu_state <= cpu_state_store;
+ latched_store <= 1;
+ cpu_state <= cpu_state_fetch;
end else if (reg_sh >= 4) begin
(* parallel_case, full_case *)
case (1'b1)
@@ -545,41 +598,6 @@ module picorv32 #(
reg_sh <= reg_sh - 1;
end
end
- cpu_state_store: begin
- mem_do_rinst <= mem_do_prefetch || mem_do_rinst;
- cpu_state <= cpu_state_fetch;
- if (instr_jal) begin
-`ifdef DEBUG
- $display("ST_RD: %2d 0x%08x", latched_rd, reg_out);
-`endif
- cpuregs[latched_rd] <= reg_out;
- end else if (instr_jalr) begin
-`ifdef DEBUG
- $display("ST_RD: %2d 0x%08x", latched_rd, reg_pc + 4);
-`endif
- cpuregs[latched_rd] <= reg_pc + 4;
- reg_pc <= reg_out;
- end else if (is_beq_bne_blt_bge_bltu_bgeu) begin
- if (reg_out_0) begin
- if (mem_done) begin
- force_mem_do_rinst = 1;
- mask_decoder_trigger <= 1;
- reg_pc <= reg_pc + decoded_imm;
- end else begin
- /* waiting for mem_done */
- cpu_state <= cpu_state_store;
- reg_out[0] <= reg_out_0;
- end
- end else
- reg_pc <= reg_pc + 4;
- end else begin
-`ifdef DEBUG
- $display("ST_RD: %2d 0x%08x", latched_rd, reg_out);
-`endif
- cpuregs[latched_rd] <= reg_out;
- reg_pc <= reg_pc + 4;
- end
- end
cpu_state_stmem: begin
if (!mem_do_prefetch || mem_done) begin
if (!mem_do_wdata) begin
@@ -590,10 +608,9 @@ module picorv32 #(
instr_sw: mem_wordsize <= 0;
endcase
reg_op1 <= reg_op1 + decoded_imm;
- force_mem_do_wdata = 1;
+ set_mem_do_wdata = 1;
end
if (!mem_do_prefetch && mem_done) begin
- reg_pc <= reg_pc + 4;
cpu_state <= cpu_state_fetch;
force_decoder_trigger <= 1;
end
@@ -608,12 +625,12 @@ module picorv32 #(
instr_lh || instr_lhu: mem_wordsize <= 1;
instr_lw: mem_wordsize <= 0;
endcase
+ latched_store <= 1;
latched_is_lu <= is_lbu_lhu_lw;
latched_is_lh <= instr_lh;
latched_is_lb <= instr_lb;
- latched_rd <= decoded_rd;
reg_op1 <= reg_op1 + decoded_imm;
- force_mem_do_rdata = 1;
+ set_mem_do_rdata = 1;
end
if (!mem_do_prefetch && mem_done) begin
(* parallel_case, full_case *)
@@ -622,7 +639,6 @@ module picorv32 #(
latched_is_lh: reg_out <= $signed(mem_buffer[15:0]);
latched_is_lb: reg_out <= $signed(mem_buffer[7:0]);
endcase
- reg_pc <= reg_pc + 4;
force_decoder_trigger <= 1;
cpu_state <= cpu_state_fetch;
end
@@ -635,20 +651,20 @@ module picorv32 #(
`ifdef DEBUG
$display("MISALIGNED WORD: 0x%08x", reg_op1);
`endif
- trap <= 1;
+ cpu_state <= cpu_state_trap;
end
if (mem_wordsize == 1 && reg_op1[0] != 0) begin
`ifdef DEBUG
$display("MISALIGNED HALFWORD: 0x%08x", reg_op1);
`endif
- trap <= 1;
+ cpu_state <= cpu_state_trap;
end
end
if (resetn && mem_do_rinst && reg_pc[1:0] != 0) begin
`ifdef DEBUG
$display("MISALIGNED INSTRUCTION: 0x%08x", reg_pc);
`endif
- trap <= 1;
+ cpu_state <= cpu_state_trap;
end
if (!resetn || mem_done) begin
@@ -658,15 +674,16 @@ module picorv32 #(
mem_do_wdata <= 0;
end
- if (force_mem_do_rinst)
+ if (set_mem_do_rinst)
mem_do_rinst <= 1;
- if (force_mem_do_rdata)
+ if (set_mem_do_rdata)
mem_do_rdata <= 1;
- if (force_mem_do_wdata)
+ if (set_mem_do_wdata)
mem_do_wdata <= 1;
- // optimize for 32bit instr alignment
reg_pc[1:0] <= 0;
+ reg_next_pc[1:0] <= 0;
+ current_pc = 'bx;
end
endmodule
diff --git a/tests/fence_i.S b/tests/fence_i.S
deleted file mode 100644
index 8785c1e..0000000
--- a/tests/fence_i.S
+++ /dev/null
@@ -1,53 +0,0 @@
-# See LICENSE for license details.
-
-#*****************************************************************************
-# fence_i.S
-#-----------------------------------------------------------------------------
-#
-# Test self-modifying code and the fence.i instruction.
-#
-
-#include "riscv_test.h"
-#include "test_macros.h"
-
-RVTEST_RV32U
-RVTEST_CODE_BEGIN
-
-li a3, 111
-la a0, 3f
-la a1, 1f
-la a2, 2f
-lw a0, 0(a0)
-
-# test I$ hit
-.align 6
-sw a0, 0(a1)
-fence.i
-
-1: addi a3, a3, 222
-TEST_CASE( 2, a3, 444, nop )
-
-# test prefetcher hit
-li a4, 100
-1: addi a4, a4, -1
-bnez a4, 1b
-
-sw a0, 0(a2)
-fence.i
-
-.align 6
-2: addi a3, a3, 555
-TEST_CASE( 3, a3, 777, nop )
-
-3: addi a3, a3, 333
-
-TEST_PASSFAIL
-
-RVTEST_CODE_END
-
- .data
-RVTEST_DATA_BEGIN
-
- TEST_DATA
-
-RVTEST_DATA_END