google / xls

XLS: Accelerated HW Synthesis
http://google.github.io/xls/
Apache License 2.0
1.19k stars 174 forks source link

XLS needs to reason more explicitly about the possible range of shifts #1059

Open sandwichmaker opened 1 year ago

sandwichmaker commented 1 year ago

Consider the following fragment of code [pdk=asap7]

pub fn main(a: u64, b: u64) -> u64 {
    if (b > u64:4) {
        u64:0
    } else { 
      (a >> b) as u64
    }
}

The right shift is guaranteed to be in the range [0,4].

The optimized ir is

top fn __main__main(a: bits[64], b: bits[64]) -> bits[64] {
  literal.15: bits[64] = literal(value=5, id=15, pos=[(0,3,4)])
  ult.16: bits[1] = ult(b, literal.15, id=16, pos=[(0,3,4)])
  shrl.6: bits[64] = shrl(a, b, id=6, pos=[(0,6,9)])
  sign_ext.12: bits[64] = sign_ext(ult.16, new_bit_count=64, id=12, pos=[(0,3,4)])
  ret and.13: bits[64] = and(shrl.6, sign_ext.12, id=13, pos=[(0,3,4)])
}

and the generated verilog is

module main(
  input wire clk,
  input wire [63:0] a,
  input wire [63:0] b,
  output wire [63:0] out
);
  // ===== Pipe stage 0:

  // Registers for pipe stage 0:
  reg [63:0] p0_a;
  reg [63:0] p0_b;
  always @ (posedge clk) begin
    p0_a <= a;
    p0_b <= b;
  end

  // ===== Pipe stage 1:
  wire p1_ult_27_comb;
  wire [63:0] p1_shrl_28_comb;
  assign p1_ult_27_comb = p0_b < 64'h0000_0000_0000_0005;
  assign p1_shrl_28_comb = p0_b >= 64'h0000_0000_0000_0040 ? 64'h0000_0000_0000_0000 : p0_a >> p0_b;

  // Registers for pipe stage 1:
  reg p1_ult_27;
  reg [63:0] p1_shrl_28;
  always @ (posedge clk) begin
    p1_ult_27 <= p1_ult_27_comb;
    p1_shrl_28 <= p1_shrl_28_comb;
  end

  // ===== Pipe stage 2:
  wire [63:0] p2_and_34_comb;
  assign p2_and_34_comb = p1_shrl_28 & {64{p1_ult_27}};

  // Registers for pipe stage 2:
  reg [63:0] p2_and_34;
  always @ (posedge clk) begin
    p2_and_34 <= p2_and_34_comb;
  end
  assign out = p2_and_34;
endmodule

The resulting synthesis has area 130.8.

Now if we are explicit and use the code

pub fn main_truncated(a: u64, b: u64) -> u64 {
    if (b > u64:4) {
        u64:0
    } else { 
      let truncated_b = b as u3;
      (a >> truncated_b) as u64
    }
}

The optimized ir is

top fn __main_truncated__main_truncated(a: bits[64], b: bits[64]) -> bits[64] {
  literal.17: bits[64] = literal(value=5, id=17, pos=[(0,11,4)])
  truncated_b: bits[3] = bit_slice(b, start=0, width=3, id=6)
  ult.18: bits[1] = ult(b, literal.17, id=18, pos=[(0,11,4)])
  shrl.7: bits[64] = shrl(a, truncated_b, id=7, pos=[(0,15,9)])
  sign_ext.14: bits[64] = sign_ext(ult.18, new_bit_count=64, id=14, pos=[(0,11,4)])
  ret and.15: bits[64] = and(shrl.7, sign_ext.14, id=15, pos=[(0,11,4)])
}

and the verilog is

module main_truncated(
  input wire clk,
  input wire [63:0] a,
  input wire [63:0] b,
  output wire [63:0] out
);
  // ===== Pipe stage 0:

  // Registers for pipe stage 0:
  reg [63:0] p0_a;
  reg [63:0] p0_b;
  always @ (posedge clk) begin
    p0_a <= a;
    p0_b <= b;
  end

  // ===== Pipe stage 1:
  wire [2:0] p1_truncated_b_comb;
  wire p1_ult_31_comb;
  wire [63:0] p1_shrl_32_comb;
  assign p1_truncated_b_comb = p0_b[2:0];
  assign p1_ult_31_comb = p0_b < 64'h0000_0000_0000_0005;
  assign p1_shrl_32_comb = p0_a >> p1_truncated_b_comb;

  // Registers for pipe stage 1:
  reg p1_ult_31;
  reg [63:0] p1_shrl_32;
  always @ (posedge clk) begin
    p1_ult_31 <= p1_ult_31_comb;
    p1_shrl_32 <= p1_shrl_32_comb;
  end

  // ===== Pipe stage 2:
  wire [63:0] p2_and_38_comb;
  assign p2_and_38_comb = p1_shrl_32 & {64{p1_ult_31}};

  // Registers for pipe stage 2:
  reg [63:0] p2_and_38;
  always @ (posedge clk) begin
    p2_and_38 <= p2_and_38_comb;
  end
  assign out = p2_and_38;
endmodule

and the synth now has area `107.7'

So either XLS needs to be more explicit about the possible range for the shift or the synthesis tool needs to be able to reason about it. IMO this is evident at XLS level and would be good for XLS to do this as an optimization.

cdleary commented 1 year ago

Just a progress note: Alex has a detailed google doc that will make its way into markdown at some point -- originally I was thinking we'd just specialize the "disjoint" cones of logic that fed the individual sides of a select as specialized based on the select condition predicate (being true or false, respectively), but Alex has a proposal for specializing the whole graph so we don't lose information at join points, which should be less surprising to users, and he has a plan to do it in a way that's not combinatorial in the number of select operations (linear instead).