chipsalliance / rocket-chip

Rocket Chip Generator

RoCC: io.mem.req.ready stuck #3635

Open ARF1939261764 opened 6 months ago

ARF1939261764 commented 6 months ago

Type of issue: bug report

Impact: RoCC memory interface


  logic [15:0] cnt;
  localparam BASE = 32'h60010000;

  assign rocc_cmd_ready = 1'b1;

  assign rocc_mem_s1_kill = 1'b0;
  assign rocc_mem_s2_kill = 1'b0;

  assign rocc_busy = 1'b0;
  assign rocc_interrupt = 1'b0;

  assign rocc_fpu_req_valid = 1'b0;
  assign rocc_fpu_resp_ready = 1'b1;

  // Issue 1024 consecutive 8-byte stores starting at BASE, kicked off by
  // the RoCC response handshake of the accumulator logic below.
  always @(posedge clock or posedge reset) begin
    if(reset) begin
      cnt <= 16'd0;  // cnt was never reset in the original code
      rocc_mem_req_valid <= 1'd0;
    end
    else begin
      if(rocc_resp_valid & rocc_resp_ready) begin
        rocc_mem_req_valid <= 1'd1;
        rocc_mem_req_bits_addr <= BASE;
        rocc_mem_req_bits_tag <= cnt[5:0];
        rocc_mem_req_bits_cmd <= 1;        // M_XWR: store
        rocc_mem_req_bits_size <= 3;       // log2(8): 8-byte access
        rocc_mem_req_bits_signed <= 0;
        rocc_mem_req_bits_phys <= 0;
        rocc_mem_req_bits_no_alloc <= 0;
        rocc_mem_req_bits_dprv <= 3;       // machine privilege
      end
      else if(rocc_mem_req_valid && rocc_mem_req_ready) begin
        cnt <= cnt + 1;
        if(cnt < 1023) begin
          // cnt + 1 so the address advances past the request that just
          // fired (the original used the stale cnt and wrote BASE twice)
          rocc_mem_req_bits_addr <= BASE + (cnt + 1) * 8;
        end
        else begin
          rocc_mem_req_valid <= 1'd0;  // the 1024th store just fired
        end
      end
    end
  end
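
  // NOTE (sketch): nothing above drives the store data. HellaCache samples
  // store data from s1_data in the cycle after the request fires, not from
  // req.bits. The port name below is my assumption, following the flattened
  // io.mem naming of the other ports (cf. rocc_mem_s1_kill above).
  always @(posedge clock) begin
    if (rocc_mem_req_valid && rocc_mem_req_ready)
      rocc_mem_s1_data_data <= {48'd0, cnt};  // example payload
  end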

  /* Accumulate rs1 and rs2 into an accumulator */
  reg [xLen-1:0] acc;
  reg doResp;
  reg [4:0] rocc_cmd_bits_inst_rd_d;
  always @ (posedge clock) begin
    if (reset) begin
      acc <= {xLen{1'b0}};
      doResp <= 1'b0;
      rocc_cmd_bits_inst_rd_d <= 5'b0;
    end
    else if (rocc_cmd_valid && rocc_cmd_ready) begin
      doResp                  <= 1;
      rocc_cmd_bits_inst_rd_d <= rocc_cmd_bits_inst_rd;
      acc                     <= acc + rocc_cmd_bits_rs1 + rocc_cmd_bits_rs2;
    end
    else begin
      doResp <= 1'b0;
    end
  end

  assign rocc_resp_valid = doResp;
  assign rocc_resp_bits_rd = rocc_cmd_bits_inst_rd_d;
  assign rocc_resp_bits_data = acc;

The test program issues a single RoCC instruction; the response handshake it produces is what kicks off the store sequence above:

#include "rocc.h"  /* ROCC_INSTRUCTION macros, e.g. the rocc.h used by Chipyard's test programs */

int main(void){
  ROCC_INSTRUCTION(3, 0);  /* custom3, funct = 0 */
  return 0;
}

[waveform: 8-byte stores to 0x60010000; rocc_mem_req_ready drops after the third request]

I used the Chipyard BlackBoxExample configuration together with the code above to produce the waveform shown. My goal is to issue 1024 consecutive 8-byte writes starting at address 0x60010000, incrementing the address with each write. But on the third write, the rocc_mem_req_ready signal goes low and never comes back up.

I found that this depends on the address being written: for example, doing the same thing at address 0x80010000 causes no problems. That waveform is shown below:

[waveform: stores to 0x80010000 proceed without stalling]

The difference between these two addresses is that 0x60010000 is mapped to the MMIO AXI interface, while 0x80010000 is mapped to the memory AXI interface.
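
For reference, that split comes from rocket-chip's default external-port configuration; the fragment below is quoted from memory and may differ in detail from your tree, so treat it as illustrative:

import freechips.rocketchip.config.Config  // org.chipsalliance.cde.config in newer trees
import freechips.rocketchip.subsystem._

// Default MMIO AXI port: 0x6000_0000-0x7FFF_FFFF, so 0x60010000 leaves the
// chip as uncached TileLink Get/Put. Main memory (ExtMem) begins at
// 0x8000_0000, so 0x80010000 is a cached access.
class WithDefaultMMIOPort extends Config((site, here, up) => {
  case ExtBus => Some(MasterPortParams(
    base      = BigInt("60000000", 16),
    size      = BigInt("20000000", 16),
    beatBytes = site(MemoryBusKey).beatBytes,
    idBits    = 4))
})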

[waveform: SimpleHellaCacheIF internals] When accessing address 0x60010000, each write request is placed into SimpleHellaCacheIF's retransmission queue. inflight indicates how many requests have not yet received a resp: a value of 3 (two bits set) means two requests are still waiting. Those entries are only freed when a response comes back, so if stores never produce one, the queue fills up and io.mem.req.ready stays low. According to what I have read, a RoCC memory write operation does not get a response, so I don't know whether this is a bug.
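
To make the failure mode concrete, here is a toy Chisel model of that bookkeeping; it is not the real SimpleHellaCacheIF code (which tracks a bitmask per entry and matches responses by tag), and all names here are mine:

import chisel3._
import chisel3.util._

// Toy model: one slot is consumed per accepted request and freed only by a
// response. If some requests (e.g. MMIO stores) never produce a response,
// the count saturates and reqReady sticks low -- the observed hang.
class InflightModel(depth: Int = 4) extends Module {
  val io = IO(new Bundle {
    val reqFire   = Input(Bool())   // request accepted by the cache
    val respValid = Input(Bool())   // response delivered back to the client
    val reqReady  = Output(Bool())
  })
  val inflight = RegInit(0.U(log2Ceil(depth + 1).W))
  when (io.reqFire =/= io.respValid) {
    inflight := Mux(io.reqFire, inflight + 1.U, inflight - 1.U)
  }
  io.reqReady := inflight =/= depth.U
}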

[waveform: loads from 0x60010000] Because read operations do get a response, read requests to 0x60010000 complete normally.

Please tell us about your environment: Rocket 1.6

What is the use case for changing the behavior? So that store operations can be performed continuously.

caizixian commented 4 weeks ago

I have a hacky fix, after which the core no longer locks up when issuing multiple RoCC instructions that result in MMIO stores (TileLink Put, I think) to the same address.

However, I'm not sure whether it's sound: I suspect it causes problems with fromhost/tohost on FireSim (the process waits indefinitely at https://github.com/ucb-bar/libgloss-htif/blob/39234a16247ab1fa234821b251f1f1870c3de343/misc/htif.c#L26). Interestingly, it works fine on Verilator.

The fix roughly looks like this: treat the AccessAck returned for an uncached Put the same way as AccessAckData, so the cache still replays a response to its client:

diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala
index 0467d149a..e6af58ff2 100644
--- a/src/main/scala/rocket/DCache.scala
+++ b/src/main/scala/rocket/DCache.scala
@@ -622,7 +622,9 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {

   // grant
   val (d_first, d_last, d_done, d_address_inc) = edge.addr_inc(tl_out.d)
-  val (d_opc, grantIsUncached, grantIsUncachedData) = {
+  // FIXME: grantIsUncachedAccess is a misleading name
+  // usingDataScratchpad still uses uncachedGrantOpcodesSansData
+  val (d_opc, grantIsUncached, grantIsUncachedAccess) = {
     val uncachedGrantOpcodesSansData = Seq(AccessAck, HintAck)
     val uncachedGrantOpcodesWithData = Seq(AccessAckData)
     val uncachedGrantOpcodes = uncachedGrantOpcodesWithData ++ uncachedGrantOpcodesSansData
@@ -634,7 +636,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
       val data = DecodeLogic(opc, uncachedGrantOpcodesWithData, uncachedGrantOpcodesSansData)
       (opc, true.B, data)
     } else {
-      (whole_opc, whole_opc.isOneOf(uncachedGrantOpcodes), whole_opc.isOneOf(uncachedGrantOpcodesWithData))
+      (whole_opc, whole_opc.isOneOf(uncachedGrantOpcodes), whole_opc.isOneOf(Seq(AccessAckData, AccessAck)))
     }
   }
   tl_d_data_encoded := encodeData(tl_out.d.bits.data, tl_out.d.bits.corrupt && !io.ptw.customCSRs.suppressCorruptOnGrantData && !grantIsUncached)
@@ -665,11 +667,12 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
           f := false.B
         }
       }
-      when (grantIsUncachedData) {
+      when (grantIsUncachedAccess) {
         if (!cacheParams.separateUncachedResp) {
           if (!cacheParams.pipelineWayMux)
             s1_data_way := 1.U << nWays
-          s2_req.cmd := M_XRD
+          // Still respond to cache clients for AccessAck in response to Put
+          s2_req.cmd := Mux(tl_out.d.bits.opcode === AccessAckData, M_XRD, M_XWR)
           s2_req.size := uncachedResp.size
           s2_req.signed := uncachedResp.signed
           s2_req.tag := uncachedResp.tag
@@ -726,7 +729,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
     // don't accept uncached grants if there's a structural hazard on s2_data...
     val blockUncachedGrant = Reg(Bool())
     blockUncachedGrant := dataArb.io.out.valid
-    when (grantIsUncachedData && (blockUncachedGrant || s1_valid)) {
+    when (grantIsUncachedAccess && (blockUncachedGrant || s1_valid)) {
       tl_out.d.ready := false.B
       // ...but insert bubble to guarantee grant's eventual forward progress
       when (tl_out.d.valid) {
@@ -923,7 +926,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
   val s2_uncached_data_word = RegEnable(s1_uncached_data_word, io.cpu.replay_next)
   val doUncachedResp = RegNext(io.cpu.replay_next)
   io.cpu.resp.valid := (s2_valid_hit_pre_data_ecc || doUncachedResp) && !s2_data_error
-  io.cpu.replay_next := tl_out.d.fire() && grantIsUncachedData && !cacheParams.separateUncachedResp.B
+  io.cpu.replay_next := tl_out.d.fire() && grantIsUncachedAccess && !cacheParams.separateUncachedResp.B
   when (doUncachedResp) {
     assert(!s2_valid_hit)
     io.cpu.resp.bits.replay := true.B
@@ -931,13 +934,13 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
   }

   io.cpu.uncached_resp.map { resp =>
-    resp.valid := tl_out.d.valid && grantIsUncachedData
+    resp.valid := tl_out.d.valid && grantIsUncachedAccess
     resp.bits.tag := uncachedResp.tag
     resp.bits.size := uncachedResp.size
     resp.bits.signed := uncachedResp.signed
     resp.bits.data := new LoadGen(uncachedResp.size, uncachedResp.signed, uncachedResp.addr, s1_uncached_data_word, false.B, wordBytes).data
     resp.bits.data_raw := s1_uncached_data_word
-    when (grantIsUncachedData && !resp.ready) {
+    when (grantIsUncachedAccess && !resp.ready) {
       tl_out.d.ready := false.B
     }
   }

@jerryz123 any thoughts on this?