aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorClifford Wolf <clifford@clifford.at>2015-06-28 13:07:50 +0200
committerClifford Wolf <clifford@clifford.at>2015-06-28 13:07:50 +0200
commit1f99de511760b3acb546e62623c65d6143b30a77 (patch)
treeb5bc87d754052c1cbd10005e8dd794ab4528eb23
parent923ac360ff3dfd4fccb4e40dcd32c9f560a8afa1 (diff)
downloadpicorv32-1f99de511760b3acb546e62623c65d6143b30a77.tar.gz
picorv32-1f99de511760b3acb546e62623c65d6143b30a77.zip
Improvements in picorv32_pcpi_mul
-rw-r--r--README.md3
-rw-r--r--firmware/start.S4
-rw-r--r--picorv32.v63
-rw-r--r--testbench.v4
4 files changed, 53 insertions, 21 deletions
diff --git a/README.md b/README.md
index 3b84a19..cc6bf91 100644
--- a/README.md
+++ b/README.md
@@ -194,6 +194,9 @@ CPI numbers for a core built without ENABLE_REGS_DUALPORT.
| indirect jump (jalr) | 6 | 6 |
| shift operations | 4-14 | 4-15 |
+When `ENABLE_MUL` is activated, then a `MUL` instruction will execute
+in 42 cycles and a `MULH[SU|U]` instruction will execute in 74 cycles.
+
Dhrystone benchmark results: 0.309 DMIPS/MHz (544 Dhrystones/Second/MHz)
For the Dhrystone benchmark the average CPI is 4.167.
diff --git a/firmware/start.S b/firmware/start.S
index f39665e..278f0d4 100644
--- a/firmware/start.S
+++ b/firmware/start.S
@@ -292,6 +292,10 @@ start:
/* break */
sbreak
+
+/* Hard mul functions for multest.c
+ **********************************/
+
hard_mul:
mul a0, a0, a1
ret
diff --git a/picorv32.v b/picorv32.v
index a311080..abccc23 100644
--- a/picorv32.v
+++ b/picorv32.v
@@ -660,12 +660,6 @@ module picorv32 #(
reg_pc <= current_pc;
reg_next_pc <= current_pc;
- if (WITH_PCPI) begin
- pcpi_insn_valid <= 0;
- pcpi_rs1_valid <= 0;
- pcpi_rs2_valid <= 0;
- end
-
latched_store <= 0;
latched_stalu <= 0;
latched_branch <= 0;
@@ -726,6 +720,10 @@ module picorv32 #(
reg_sh <= decoded_rs2 ? cpuregs[decoded_rs2] : 0;
reg_op2 <= decoded_rs2 ? cpuregs[decoded_rs2] : 0;
if (pcpi_int_ready) begin
+ mem_do_rinst <= 1;
+ pcpi_insn_valid <= 0;
+ pcpi_rs1_valid <= 0;
+ pcpi_rs2_valid <= 0;
reg_out <= pcpi_int_rd;
latched_store <= pcpi_int_rd_valid;
cpu_state <= cpu_state_fetch;
@@ -848,6 +846,10 @@ module picorv32 #(
if (WITH_PCPI && pcpi_insn_valid) begin
pcpi_rs2_valid <= 1;
if (pcpi_int_ready) begin
+ mem_do_rinst <= 1;
+ pcpi_insn_valid <= 0;
+ pcpi_rs1_valid <= 0;
+ pcpi_rs2_valid <= 0;
reg_out <= pcpi_int_rd;
latched_store <= pcpi_int_rd_valid;
cpu_state <= cpu_state_fetch;
@@ -1023,7 +1025,10 @@ endmodule
* picorv32_pcpi_mul
***************************************************************/
-module picorv32_pcpi_mul (
+module picorv32_pcpi_mul #(
+ // increasing this parameter increases performance and core size
+ parameter STEPS_AT_ONCE = 1
+) (
input clk, resetn,
input pcpi_insn_valid,
@@ -1067,9 +1072,32 @@ module picorv32_pcpi_mul (
end
reg [63:0] rs1, rs2, rd, rdx;
+ reg [63:0] next_rs1, next_rs2, next_rd, next_rdx, next_rdt;
reg [6:0] mul_counter;
reg mul_waiting;
reg mul_finish;
+ integer i;
+
+ // carry save accumulator
+ always @* begin
+ next_rd = rd;
+ next_rdx = rdx;
+ next_rs1 = rs1;
+ next_rs2 = rs2;
+
+ for (i = 0; i < STEPS_AT_ONCE; i=i+1) begin
+ if (next_rs1[0]) begin
+ next_rdt = (next_rd ^ next_rdx) ^ next_rs2;
+ next_rdx = ((next_rd & next_rdx) | (next_rd & next_rs2) | (next_rdx & next_rs2)) << 1;
+ end else begin
+ next_rdt = next_rd ^ next_rdx;
+ next_rdx = (next_rd & next_rdx) << 1;
+ end
+ next_rd = next_rdt;
+ next_rs1 = next_rs1 >> 1;
+ next_rs2 = next_rs2 << 1;
+ end
+ end
always @(posedge clk) begin
mul_finish <= 0;
@@ -1089,21 +1117,16 @@ module picorv32_pcpi_mul (
rd <= 0;
rdx <= 0;
- mul_counter <= instr_any_mulh ? 64 : 32;
+ mul_counter <= (instr_any_mulh ? 63 - STEPS_AT_ONCE : 31 - STEPS_AT_ONCE);
mul_waiting <= !mul_start;
end else begin
- // carry save accumulator
- if (rs1[0]) begin
- rd <= rd ^ rdx ^ rs2;
- rdx <= ((rd & rdx) | (rd & rs2) | (rdx & rs2)) << 1;
- end else begin
- rd <= rd ^ rdx;
- rdx <= (rd & rdx) << 1;
- end
- rs1 <= rs1 >> 1;
- rs2 <= rs2 << 1;
- mul_counter <= mul_counter - 1;
- if (!mul_counter) begin
+ rd <= next_rd;
+ rdx <= next_rdx;
+ rs1 <= next_rs1;
+ rs2 <= next_rs2;
+
+ mul_counter <= mul_counter - STEPS_AT_ONCE;
+ if (mul_counter[6]) begin
mul_finish <= 1;
mul_waiting <= 1;
end
diff --git a/testbench.v b/testbench.v
index 135fbdc..53badd3 100644
--- a/testbench.v
+++ b/testbench.v
@@ -244,10 +244,12 @@ module testbench;
$finish;
end
+ integer cycle_counter;
always @(posedge clk) begin
+ cycle_counter <= resetn ? cycle_counter + 1 : 0;
if (resetn && trap) begin
repeat (10) @(posedge clk);
- $display("TRAP");
+ $display("TRAP after %1d clock cycles", cycle_counter);
$finish;
end
end