From dd30b57ea6d799b0e2112f1fb130f1bc6df5f86e Mon Sep 17 00:00:00 2001 From: Clifford Wolf Date: Wed, 8 Jul 2015 20:17:03 +0200 Subject: Added TWO_CYCLE_ALU parameter --- README.md | 12 +++++ picorv32.v | 176 +++++++++++++++++++++++++++++++++++++++++++------------------ 2 files changed, 136 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index af9bcf7..98ea96d 100644 --- a/README.md +++ b/README.md @@ -171,6 +171,18 @@ This relaxes the longest data path a bit by adding an additional FF stage at the cost of adding an additional clock cycle delay to the conditional branch instructions. +*Note: Enabling this parameter will be most effective when retiming (aka +"register balancing") is enabled in the synthesis flow.* + +#### TWO_CYCLE_ALU (default = 0) + +This adds an additional FF stage in the ALU data path, improving timing +at the cost of an additional clock cycle for all instructions that use +the ALU. + +*Note: Enabling this parameter will be most effective when retiming (aka +"register balancing") is enabled in the synthesis flow.* + #### CATCH_MISALIGN (default = 1) Set this to 0 to disable the circuitry for catching misaligned memory diff --git a/picorv32.v b/picorv32.v index b73e06b..cf39b29 100644 --- a/picorv32.v +++ b/picorv32.v @@ -38,6 +38,7 @@ module picorv32 #( parameter [ 0:0] LATCHED_MEM_RDATA = 0, parameter [ 0:0] TWO_STAGE_SHIFT = 1, parameter [ 0:0] TWO_CYCLE_COMPARE = 0, + parameter [ 0:0] TWO_CYCLE_ALU = 0, parameter [ 0:0] CATCH_MISALIGN = 1, parameter [ 0:0] CATCH_ILLINSN = 1, parameter [ 0:0] ENABLE_PCPI = 0, @@ -530,46 +531,104 @@ module picorv32 #( reg [31:0] next_irq_pending; reg do_waitirq; - reg [31:0] alu_out, reg_alu_out; - reg alu_out_0, reg_alu_out_0; - reg alu_wait; + reg [31:0] alu_out, alu_out_q; + reg alu_out_0, alu_out_0_q; + reg alu_wait, alu_wait_2; always @* begin - alu_out_0 = 'bx; - (* parallel_case, full_case *) - case (1'b1) - instr_beq: - alu_out_0 = reg_op1 == reg_op2; - instr_bne: - alu_out_0 = reg_op1 
!= reg_op2; - instr_bge: - alu_out_0 = $signed(reg_op1) >= $signed(reg_op2); - instr_bgeu: - alu_out_0 = reg_op1 >= reg_op2; - is_slti_blt_slt: - alu_out_0 = $signed(reg_op1) < $signed(reg_op2); - is_sltiu_bltu_sltu: - alu_out_0 = reg_op1 < reg_op2; - endcase - - alu_out = 'bx; - (* parallel_case, full_case *) - case (1'b1) - is_lui_auipc_jal_jalr_addi_add: - alu_out = reg_op1 + reg_op2; - instr_sub: - alu_out = reg_op1 - reg_op2; - is_compare: - alu_out = alu_out_0; - instr_xori || instr_xor: - alu_out = reg_op1 ^ reg_op2; - instr_ori || instr_or: - alu_out = reg_op1 | reg_op2; - instr_andi || instr_and: - alu_out = reg_op1 & reg_op2; - endcase end + generate if (TWO_CYCLE_ALU) begin:two_cycle_alu /* two-stage ALU: the clocked block registers the raw add/sub, bitwise and compare results; the always @* block selects among those registers */ + reg [31:0] alu_add_sub; + reg [31:0] alu_xor_or_and; + reg alu_eq, alu_ltu, alu_lts; + + always @(posedge clk) begin /* stage 1: non-blocking assignments throughout, so every reader sees the pre-edge value; the last non-blocking write to alu_xor_or_and wins, which keeps 'bx as the no-match default */ + alu_add_sub <= instr_sub ? reg_op1 - reg_op2 : reg_op1 + reg_op2; + + alu_xor_or_and <= 'bx; + (* parallel_case, full_case *) + case (1'b1) + instr_xori || instr_xor: + alu_xor_or_and <= reg_op1 ^ reg_op2; + instr_ori || instr_or: + alu_xor_or_and <= reg_op1 | reg_op2; + instr_andi || instr_and: + alu_xor_or_and <= reg_op1 & reg_op2; + endcase + + alu_eq <= reg_op1 == reg_op2; + alu_lts <= $signed(reg_op1) < $signed(reg_op2); + alu_ltu <= reg_op1 < reg_op2; + end + + always @* begin /* stage 2: pick the registered result that matches the decoded instruction */ + alu_out_0 = 'bx; + (* parallel_case, full_case *) + case (1'b1) + instr_beq: + alu_out_0 = alu_eq; + instr_bne: + alu_out_0 = !alu_eq; + instr_bge: + alu_out_0 = !alu_lts; + instr_bgeu: + alu_out_0 = !alu_ltu; + is_slti_blt_slt: + alu_out_0 = alu_lts; + is_sltiu_bltu_sltu: + alu_out_0 = alu_ltu; + endcase + + alu_out = 'bx; + (* parallel_case, full_case *) + case (1'b1) + is_lui_auipc_jal_jalr_addi_add || instr_sub: + alu_out = alu_add_sub; + is_compare: + alu_out = alu_out_0; + |{instr_xori, instr_xor, instr_ori, instr_or, instr_andi, instr_and}: + alu_out = alu_xor_or_and; + endcase + end + end else begin:one_cycle_alu + always @* begin + alu_out_0 = 'bx; + (* 
parallel_case, full_case *) + case (1'b1) + instr_beq: + alu_out_0 = reg_op1 == reg_op2; + instr_bne: + alu_out_0 = reg_op1 != reg_op2; + instr_bge: + alu_out_0 = $signed(reg_op1) >= $signed(reg_op2); + instr_bgeu: + alu_out_0 = reg_op1 >= reg_op2; + is_slti_blt_slt: + alu_out_0 = $signed(reg_op1) < $signed(reg_op2); + is_sltiu_bltu_sltu: + alu_out_0 = reg_op1 < reg_op2; + endcase + + alu_out = 'bx; + (* parallel_case, full_case *) + case (1'b1) + is_lui_auipc_jal_jalr_addi_add: + alu_out = reg_op1 + reg_op2; + instr_sub: + alu_out = reg_op1 - reg_op2; + is_compare: + alu_out = alu_out_0; + instr_xori || instr_xor: + alu_out = reg_op1 ^ reg_op2; + instr_ori || instr_or: + alu_out = reg_op1 | reg_op2; + instr_andi || instr_and: + alu_out = reg_op1 & reg_op2; + endcase + end + end endgenerate + always @(posedge clk) begin trap <= 0; reg_sh <= 'bx; @@ -578,9 +637,11 @@ module picorv32 #( set_mem_do_rdata = 0; set_mem_do_wdata = 0; - reg_alu_out <= alu_out; - reg_alu_out_0 <= alu_out_0; + alu_out_0_q <= alu_out_0; + alu_out_q <= alu_out; + alu_wait <= 0; + alu_wait_2 <= 0; if (WITH_PCPI && CATCH_ILLINSN) begin if (resetn && pcpi_valid && !pcpi_int_wait) begin @@ -646,13 +707,13 @@ module picorv32 #( (* parallel_case *) case (1'b1) latched_branch: begin - current_pc = latched_store ? (latched_stalu ? reg_alu_out : reg_out) : reg_next_pc; + current_pc = latched_store ? (latched_stalu ? alu_out_q : reg_out) : reg_next_pc; `debug($display("ST_RD: %2d 0x%08x, BRANCH 0x%08x", latched_rd, reg_pc + 4, current_pc);) cpuregs[latched_rd] <= reg_pc + 4; end latched_store && !latched_branch: begin - `debug($display("ST_RD: %2d 0x%08x", latched_rd, latched_stalu ? reg_alu_out : reg_out);) - cpuregs[latched_rd] <= latched_stalu ? reg_alu_out : reg_out; + `debug($display("ST_RD: %2d 0x%08x", latched_rd, latched_stalu ? alu_out_q : reg_out);) + cpuregs[latched_rd] <= latched_stalu ? 
alu_out_q : reg_out; end ENABLE_IRQ && irq_state[0]: begin cpuregs[latched_rd] <= current_pc; @@ -775,7 +836,10 @@ module picorv32 #( is_lui_auipc_jal: begin reg_op1 <= instr_lui ? 0 : reg_pc; reg_op2 <= decoded_imm; - mem_do_rinst <= mem_do_prefetch; + if (TWO_CYCLE_ALU) + alu_wait <= 1; + else + mem_do_rinst <= mem_do_prefetch; cpu_state <= cpu_state_exec; end ENABLE_IRQ && ENABLE_IRQ_QREGS && instr_getq: begin @@ -830,7 +894,10 @@ module picorv32 #( `debug($display("LD_RS1: %2d 0x%08x", decoded_rs1, decoded_rs1 ? cpuregs[decoded_rs1] : 0);) reg_op1 <= decoded_rs1 ? cpuregs[decoded_rs1] : 0; reg_op2 <= decoded_imm; - mem_do_rinst <= mem_do_prefetch; + if (TWO_CYCLE_ALU) + alu_wait <= 1; + else + mem_do_rinst <= mem_do_prefetch; cpu_state <= cpu_state_exec; end default: begin @@ -850,9 +917,10 @@ module picorv32 #( cpu_state <= cpu_state_shift; end default: begin - if (TWO_CYCLE_COMPARE && is_beq_bne_blt_bge_bltu_bgeu) + if (TWO_CYCLE_ALU || (TWO_CYCLE_COMPARE && is_beq_bne_blt_bge_bltu_bgeu)) begin + alu_wait_2 <= TWO_CYCLE_ALU && (TWO_CYCLE_COMPARE && is_beq_bne_blt_bge_bltu_bgeu); alu_wait <= 1; - else + end else mem_do_rinst <= mem_do_prefetch; cpu_state <= cpu_state_exec; end @@ -896,9 +964,10 @@ module picorv32 #( cpu_state <= cpu_state_shift; end default: begin - if (TWO_CYCLE_COMPARE && is_beq_bne_blt_bge_bltu_bgeu) + if (TWO_CYCLE_ALU || (TWO_CYCLE_COMPARE && is_beq_bne_blt_bge_bltu_bgeu)) begin + alu_wait_2 <= TWO_CYCLE_ALU && (TWO_CYCLE_COMPARE && is_beq_bne_blt_bge_bltu_bgeu); alu_wait <= 1; - else + end else mem_do_rinst <= mem_do_prefetch; cpu_state <= cpu_state_exec; end @@ -906,17 +975,18 @@ module picorv32 #( end cpu_state_exec: begin - latched_store <= TWO_CYCLE_COMPARE ? reg_alu_out_0 : alu_out_0; - latched_branch <= TWO_CYCLE_COMPARE ? reg_alu_out_0 : alu_out_0; + latched_store <= TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0; + latched_branch <= TWO_CYCLE_COMPARE ? 
alu_out_0_q : alu_out_0; reg_out <= reg_pc + decoded_imm; - if (TWO_CYCLE_COMPARE && alu_wait) begin - mem_do_rinst <= mem_do_prefetch; + if ((TWO_CYCLE_ALU || TWO_CYCLE_COMPARE) && (alu_wait || alu_wait_2)) begin + mem_do_rinst <= mem_do_prefetch && !alu_wait_2; + alu_wait <= alu_wait_2; end else if (is_beq_bne_blt_bge_bltu_bgeu) begin latched_rd <= 0; if (mem_done) cpu_state <= cpu_state_fetch; - if (TWO_CYCLE_COMPARE ? reg_alu_out_0 : alu_out_0) begin + if (TWO_CYCLE_COMPARE ? alu_out_0_q : alu_out_0) begin decoder_trigger <= 0; set_mem_do_rinst = 1; end @@ -1187,6 +1257,7 @@ module picorv32_axi #( parameter [ 0:0] ENABLE_REGS_DUALPORT = 1, parameter [ 0:0] TWO_STAGE_SHIFT = 1, parameter [ 0:0] TWO_CYCLE_COMPARE = 0, + parameter [ 0:0] TWO_CYCLE_ALU = 0, parameter [ 0:0] CATCH_MISALIGN = 1, parameter [ 0:0] CATCH_ILLINSN = 1, parameter [ 0:0] ENABLE_PCPI = 0, @@ -1283,6 +1354,7 @@ module picorv32_axi #( .ENABLE_REGS_DUALPORT(ENABLE_REGS_DUALPORT), .TWO_STAGE_SHIFT (TWO_STAGE_SHIFT ), .TWO_CYCLE_COMPARE (TWO_CYCLE_COMPARE ), + .TWO_CYCLE_ALU (TWO_CYCLE_ALU ), .CATCH_MISALIGN (CATCH_MISALIGN ), .CATCH_ILLINSN (CATCH_ILLINSN ), .ENABLE_PCPI (ENABLE_PCPI ), -- cgit