StanfordAHA / lassen

The PE for the second generation CGRA (garnet).
16 stars 4 forks source link

Operand registers not properly clock gated #154

Open Kuree opened 5 years ago

Kuree commented 5 years ago

Currently the CE signal is wired to the tile-level clock-enable signal. Ideally this should be wired to a signal that ANDs the clock-enable signal with the register mode signal. In other words, these operand registers should only be enabled when they are actually intended to be used. Otherwise we will see significant switching power from these operand registers, which will invalidate any design space exploration we plan to do.

leonardt commented 5 years ago

I'm a bit confused; could you provide more information/context here? Does this have to do with the lassen/peak description or with how the code is compiled to RTL?

From what I can tell, currently the clk_en is a top level port to the PE (https://github.com/StanfordAHA/lassen/blob/master/lassen/sim.py#L381). It is passed through to the register mode instances (https://github.com/StanfordAHA/lassen/blob/master/lassen/mode.py#L33), which is then passed through to the peak register only when we're in delay mode (https://github.com/StanfordAHA/lassen/blob/master/lassen/mode.py#L37) otherwise it is fixed to a constant value depending on the mode. The peak register implements the enable logic explicitly in the code (https://github.com/cdonovick/peak/blob/master/peak/register.py#L14)

Kuree commented 5 years ago

Here is what I had in mind if it's written in SystemVerilog.

// Operand-register mode selector used by the PECore sketch that follows.
// NOTE(review): the generated RegisterMode_comb netlist quoted later in this
// thread compares a 2-bit `mode` against 2'h0, 2'h2 and 2'h3, so the numeric
// encodings here presumably do not match the real ones -- confirm before reuse.
typedef enum int {
ByPass = 0,
Delay = 1,
Const = 2
} RegMode;

// Sketch of the proposed gating, not complete RTL: the remaining ports and the
// Register connections are elided with "...", and the module is left unclosed.
module PECore (
  input logic clk,
  input logic clk_en,
  input RegMode reg_mode
  ...
)

// Enable actually seen by the operand register: clk_en qualified by the mode.
logic reg_clk_en;

// only enable the register when it's configured as delay
assign reg_clk_en = (reg_mode == Delay) & clk_en;

// The register is driven by the gated enable rather than the raw clk_en.
Register operand_reg1(.clk(clk), .clk_en(reg_clk_en), ...)

What's happening right now is

// RegisterMode_comb: combinational logic only (mux / eq / const instances and
// continuous assigns). RegisterMode instantiates it next to one Register and
// wires O0 -> Register.value, O1 -> Register.en, O2/O3 -> RegisterMode O0/O1,
// and the Register output back into self_register_O.
//
// Net effect of the mux tree, traced from the assigns in this module.
// NOTE(review): this assumes Mux2xOutBit / Mux2xOutBits16 drive O = S ? I1 : I0
// and that corebit_eq / coreir_eq output 1 when their inputs are equal; those
// definitions are not shown here -- confirm.
//   O0 = config_we ? config_data : value
//   O1 = config_we ? 1'b1 : ((mode == 2'h3) ? clk_en : 1'b0)
//   O2 = (mode == 2'h0) ? const_ : ((mode == 2'h2) ? value : self_register_O)
//   O3 = self_register_O
// Several muxes have I0 and I1 tied to the same net, so their select has no
// effect on the result regardless of mux polarity.
module RegisterMode_comb (
  output [15:0] O0,
  output  O1,
  output [15:0] O2,
  output [15:0] O3,
  input  clk_en,
  input [15:0] config_data,
  input  config_we,
  input [15:0] const_,
  input [1:0] mode,
  input [15:0] self_register_O,
  input [15:0] value
);

  // ---- 1-bit mux chain that produces O1 ----
  // inst0: chooses between constant 0 and clk_en, select = (mode == 2'h3).
  wire  Mux2xOutBit_inst0__I0;
  wire  Mux2xOutBit_inst0__I1;
  wire  Mux2xOutBit_inst0__O;
  wire  Mux2xOutBit_inst0__S;
  Mux2xOutBit Mux2xOutBit_inst0(
    .I0(Mux2xOutBit_inst0__I0),
    .I1(Mux2xOutBit_inst0__I1),
    .O(Mux2xOutBit_inst0__O),
    .S(Mux2xOutBit_inst0__S)
  );

  // inst1: chooses between inst0's output and constant 1, select = (config_we == 1).
  wire  Mux2xOutBit_inst1__I0;
  wire  Mux2xOutBit_inst1__I1;
  wire  Mux2xOutBit_inst1__O;
  wire  Mux2xOutBit_inst1__S;
  Mux2xOutBit Mux2xOutBit_inst1(
    .I0(Mux2xOutBit_inst1__I0),
    .I1(Mux2xOutBit_inst1__I1),
    .O(Mux2xOutBit_inst1__O),
    .S(Mux2xOutBit_inst1__S)
  );

  // inst2: both data inputs are inst1's output (select = (mode == 2'h2)).
  wire  Mux2xOutBit_inst2__I0;
  wire  Mux2xOutBit_inst2__I1;
  wire  Mux2xOutBit_inst2__O;
  wire  Mux2xOutBit_inst2__S;
  Mux2xOutBit Mux2xOutBit_inst2(
    .I0(Mux2xOutBit_inst2__I0),
    .I1(Mux2xOutBit_inst2__I1),
    .O(Mux2xOutBit_inst2__O),
    .S(Mux2xOutBit_inst2__S)
  );

  // inst3: I0 = inst2's output, I1 = inst1's output (select = (mode == 2'h0)); drives O1.
  wire  Mux2xOutBit_inst3__I0;
  wire  Mux2xOutBit_inst3__I1;
  wire  Mux2xOutBit_inst3__O;
  wire  Mux2xOutBit_inst3__S;
  Mux2xOutBit Mux2xOutBit_inst3(
    .I0(Mux2xOutBit_inst3__I0),
    .I1(Mux2xOutBit_inst3__I1),
    .O(Mux2xOutBit_inst3__O),
    .S(Mux2xOutBit_inst3__S)
  );

  // ---- 16-bit muxes ----
  // Chains, as wired by the assigns further down:
  //   O0: inst0 -> inst2 -> inst4 -> inst7
  //   O2: inst1 -> inst3 -> inst5 -> inst8
  //   O3: inst1 -> inst3 -> inst6 -> inst9
  wire [15:0] Mux2xOutBits16_inst0__I0;
  wire [15:0] Mux2xOutBits16_inst0__I1;
  wire [15:0] Mux2xOutBits16_inst0__O;
  wire  Mux2xOutBits16_inst0__S;
  Mux2xOutBits16 Mux2xOutBits16_inst0(
    .I0(Mux2xOutBits16_inst0__I0),
    .I1(Mux2xOutBits16_inst0__I1),
    .O(Mux2xOutBits16_inst0__O),
    .S(Mux2xOutBits16_inst0__S)
  );

  wire [15:0] Mux2xOutBits16_inst1__I0;
  wire [15:0] Mux2xOutBits16_inst1__I1;
  wire [15:0] Mux2xOutBits16_inst1__O;
  wire  Mux2xOutBits16_inst1__S;
  Mux2xOutBits16 Mux2xOutBits16_inst1(
    .I0(Mux2xOutBits16_inst1__I0),
    .I1(Mux2xOutBits16_inst1__I1),
    .O(Mux2xOutBits16_inst1__O),
    .S(Mux2xOutBits16_inst1__S)
  );

  wire [15:0] Mux2xOutBits16_inst2__I0;
  wire [15:0] Mux2xOutBits16_inst2__I1;
  wire [15:0] Mux2xOutBits16_inst2__O;
  wire  Mux2xOutBits16_inst2__S;
  Mux2xOutBits16 Mux2xOutBits16_inst2(
    .I0(Mux2xOutBits16_inst2__I0),
    .I1(Mux2xOutBits16_inst2__I1),
    .O(Mux2xOutBits16_inst2__O),
    .S(Mux2xOutBits16_inst2__S)
  );

  wire [15:0] Mux2xOutBits16_inst3__I0;
  wire [15:0] Mux2xOutBits16_inst3__I1;
  wire [15:0] Mux2xOutBits16_inst3__O;
  wire  Mux2xOutBits16_inst3__S;
  Mux2xOutBits16 Mux2xOutBits16_inst3(
    .I0(Mux2xOutBits16_inst3__I0),
    .I1(Mux2xOutBits16_inst3__I1),
    .O(Mux2xOutBits16_inst3__O),
    .S(Mux2xOutBits16_inst3__S)
  );

  wire [15:0] Mux2xOutBits16_inst4__I0;
  wire [15:0] Mux2xOutBits16_inst4__I1;
  wire [15:0] Mux2xOutBits16_inst4__O;
  wire  Mux2xOutBits16_inst4__S;
  Mux2xOutBits16 Mux2xOutBits16_inst4(
    .I0(Mux2xOutBits16_inst4__I0),
    .I1(Mux2xOutBits16_inst4__I1),
    .O(Mux2xOutBits16_inst4__O),
    .S(Mux2xOutBits16_inst4__S)
  );

  wire [15:0] Mux2xOutBits16_inst5__I0;
  wire [15:0] Mux2xOutBits16_inst5__I1;
  wire [15:0] Mux2xOutBits16_inst5__O;
  wire  Mux2xOutBits16_inst5__S;
  Mux2xOutBits16 Mux2xOutBits16_inst5(
    .I0(Mux2xOutBits16_inst5__I0),
    .I1(Mux2xOutBits16_inst5__I1),
    .O(Mux2xOutBits16_inst5__O),
    .S(Mux2xOutBits16_inst5__S)
  );

  wire [15:0] Mux2xOutBits16_inst6__I0;
  wire [15:0] Mux2xOutBits16_inst6__I1;
  wire [15:0] Mux2xOutBits16_inst6__O;
  wire  Mux2xOutBits16_inst6__S;
  Mux2xOutBits16 Mux2xOutBits16_inst6(
    .I0(Mux2xOutBits16_inst6__I0),
    .I1(Mux2xOutBits16_inst6__I1),
    .O(Mux2xOutBits16_inst6__O),
    .S(Mux2xOutBits16_inst6__S)
  );

  wire [15:0] Mux2xOutBits16_inst7__I0;
  wire [15:0] Mux2xOutBits16_inst7__I1;
  wire [15:0] Mux2xOutBits16_inst7__O;
  wire  Mux2xOutBits16_inst7__S;
  Mux2xOutBits16 Mux2xOutBits16_inst7(
    .I0(Mux2xOutBits16_inst7__I0),
    .I1(Mux2xOutBits16_inst7__I1),
    .O(Mux2xOutBits16_inst7__O),
    .S(Mux2xOutBits16_inst7__S)
  );

  wire [15:0] Mux2xOutBits16_inst8__I0;
  wire [15:0] Mux2xOutBits16_inst8__I1;
  wire [15:0] Mux2xOutBits16_inst8__O;
  wire  Mux2xOutBits16_inst8__S;
  Mux2xOutBits16 Mux2xOutBits16_inst8(
    .I0(Mux2xOutBits16_inst8__I0),
    .I1(Mux2xOutBits16_inst8__I1),
    .O(Mux2xOutBits16_inst8__O),
    .S(Mux2xOutBits16_inst8__S)
  );

  wire [15:0] Mux2xOutBits16_inst9__I0;
  wire [15:0] Mux2xOutBits16_inst9__I1;
  wire [15:0] Mux2xOutBits16_inst9__O;
  wire  Mux2xOutBits16_inst9__S;
  Mux2xOutBits16 Mux2xOutBits16_inst9(
    .I0(Mux2xOutBits16_inst9__I0),
    .I1(Mux2xOutBits16_inst9__I1),
    .O(Mux2xOutBits16_inst9__O),
    .S(Mux2xOutBits16_inst9__S)
  );

  // ---- Constants ----
  // 1-bit constants 0 and 1.
  wire  bit_const_0_None__out;
  corebit_const #(.value(0)) bit_const_0_None(
    .out(bit_const_0_None__out)
  );

  wire  bit_const_1_None__out;
  corebit_const #(.value(1)) bit_const_1_None(
    .out(bit_const_1_None__out)
  );

  // 2-bit constants 0, 2 and 3 that `mode` is compared against.
  // Instancing generated Module: coreir.const(width:2)
  wire [1:0] const_0_2__out;
  coreir_const #(.value(2'h0),.width(2)) const_0_2(
    .out(const_0_2__out)
  );

  // Instancing generated Module: coreir.const(width:2)
  wire [1:0] const_2_2__out;
  coreir_const #(.value(2'h2),.width(2)) const_2_2(
    .out(const_2_2__out)
  );

  // Instancing generated Module: coreir.const(width:2)
  wire [1:0] const_3_2__out;
  coreir_const #(.value(2'h3),.width(2)) const_3_2(
    .out(const_3_2__out)
  );

  // ---- Comparators ----
  // corebit_eq_inst0..2 each compare config_we against constant 1.
  wire  corebit_eq_inst0__I0;
  wire  corebit_eq_inst0__I1;
  wire  corebit_eq_inst0__O;
  corebit_eq corebit_eq_inst0(
    .I0(corebit_eq_inst0__I0),
    .I1(corebit_eq_inst0__I1),
    .O(corebit_eq_inst0__O)
  );

  wire  corebit_eq_inst1__I0;
  wire  corebit_eq_inst1__I1;
  wire  corebit_eq_inst1__O;
  corebit_eq corebit_eq_inst1(
    .I0(corebit_eq_inst1__I0),
    .I1(corebit_eq_inst1__I1),
    .O(corebit_eq_inst1__O)
  );

  wire  corebit_eq_inst2__I0;
  wire  corebit_eq_inst2__I1;
  wire  corebit_eq_inst2__O;
  corebit_eq corebit_eq_inst2(
    .I0(corebit_eq_inst2__I0),
    .I1(corebit_eq_inst2__I1),
    .O(corebit_eq_inst2__O)
  );

  // coreir_eq_2_inst0..10 each compare `mode` against one 2-bit constant:
  //   inst0, inst1, inst2        -> 2'h3
  //   inst3, inst4, inst5, inst6 -> 2'h2
  //   inst7, inst8, inst9, inst10 -> 2'h0
  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst0__in0;
  wire [1:0] coreir_eq_2_inst0__in1;
  wire  coreir_eq_2_inst0__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst0(
    .in0(coreir_eq_2_inst0__in0),
    .in1(coreir_eq_2_inst0__in1),
    .out(coreir_eq_2_inst0__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst1__in0;
  wire [1:0] coreir_eq_2_inst1__in1;
  wire  coreir_eq_2_inst1__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst1(
    .in0(coreir_eq_2_inst1__in0),
    .in1(coreir_eq_2_inst1__in1),
    .out(coreir_eq_2_inst1__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst10__in0;
  wire [1:0] coreir_eq_2_inst10__in1;
  wire  coreir_eq_2_inst10__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst10(
    .in0(coreir_eq_2_inst10__in0),
    .in1(coreir_eq_2_inst10__in1),
    .out(coreir_eq_2_inst10__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst2__in0;
  wire [1:0] coreir_eq_2_inst2__in1;
  wire  coreir_eq_2_inst2__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst2(
    .in0(coreir_eq_2_inst2__in0),
    .in1(coreir_eq_2_inst2__in1),
    .out(coreir_eq_2_inst2__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst3__in0;
  wire [1:0] coreir_eq_2_inst3__in1;
  wire  coreir_eq_2_inst3__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst3(
    .in0(coreir_eq_2_inst3__in0),
    .in1(coreir_eq_2_inst3__in1),
    .out(coreir_eq_2_inst3__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst4__in0;
  wire [1:0] coreir_eq_2_inst4__in1;
  wire  coreir_eq_2_inst4__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst4(
    .in0(coreir_eq_2_inst4__in0),
    .in1(coreir_eq_2_inst4__in1),
    .out(coreir_eq_2_inst4__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst5__in0;
  wire [1:0] coreir_eq_2_inst5__in1;
  wire  coreir_eq_2_inst5__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst5(
    .in0(coreir_eq_2_inst5__in0),
    .in1(coreir_eq_2_inst5__in1),
    .out(coreir_eq_2_inst5__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst6__in0;
  wire [1:0] coreir_eq_2_inst6__in1;
  wire  coreir_eq_2_inst6__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst6(
    .in0(coreir_eq_2_inst6__in0),
    .in1(coreir_eq_2_inst6__in1),
    .out(coreir_eq_2_inst6__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst7__in0;
  wire [1:0] coreir_eq_2_inst7__in1;
  wire  coreir_eq_2_inst7__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst7(
    .in0(coreir_eq_2_inst7__in0),
    .in1(coreir_eq_2_inst7__in1),
    .out(coreir_eq_2_inst7__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst8__in0;
  wire [1:0] coreir_eq_2_inst8__in1;
  wire  coreir_eq_2_inst8__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst8(
    .in0(coreir_eq_2_inst8__in0),
    .in1(coreir_eq_2_inst8__in1),
    .out(coreir_eq_2_inst8__out)
  );

  // Instancing generated Module: coreir.eq(width:2)
  wire [1:0] coreir_eq_2_inst9__in0;
  wire [1:0] coreir_eq_2_inst9__in1;
  wire  coreir_eq_2_inst9__out;
  coreir_eq #(.width(2)) coreir_eq_2_inst9(
    .in0(coreir_eq_2_inst9__in0),
    .in1(coreir_eq_2_inst9__in1),
    .out(coreir_eq_2_inst9__out)
  );

  // ---- Wiring: 1-bit chain producing O1 (the register enable) ----
  // inst0 picks between constant 0 and clk_en on (mode == 2'h3).
  assign Mux2xOutBit_inst0__I0 = bit_const_0_None__out;

  assign Mux2xOutBit_inst0__I1 = clk_en;

  assign Mux2xOutBit_inst1__I0 = Mux2xOutBit_inst0__O;

  assign Mux2xOutBit_inst0__S = coreir_eq_2_inst1__out;

  // inst1 picks between inst0's result and constant 1 on (config_we == 1).
  assign Mux2xOutBit_inst1__I1 = bit_const_1_None__out;

  // inst2 and inst3 only ever see inst1's output on their data inputs, so O1
  // always equals Mux2xOutBit_inst1__O.
  assign Mux2xOutBit_inst2__I0 = Mux2xOutBit_inst1__O;

  assign Mux2xOutBit_inst2__I1 = Mux2xOutBit_inst1__O;

  assign Mux2xOutBit_inst3__I1 = Mux2xOutBit_inst1__O;

  assign Mux2xOutBit_inst1__S = corebit_eq_inst1__O;

  assign Mux2xOutBit_inst3__I0 = Mux2xOutBit_inst2__O;

  assign Mux2xOutBit_inst2__S = coreir_eq_2_inst4__out;

  assign O1 = Mux2xOutBit_inst3__O;

  assign Mux2xOutBit_inst3__S = coreir_eq_2_inst8__out;

  // ---- Wiring: 16-bit chains producing O0, O2 and O3 ----
  // inst0: both inputs are `value`, so its output is `value`.
  assign Mux2xOutBits16_inst0__I0[15:0] = value[15:0];

  assign Mux2xOutBits16_inst0__I1[15:0] = value[15:0];

  assign Mux2xOutBits16_inst2__I0[15:0] = Mux2xOutBits16_inst0__O[15:0];

  assign Mux2xOutBits16_inst0__S = coreir_eq_2_inst0__out;

  // inst1: both inputs are self_register_O, so its output is self_register_O.
  assign Mux2xOutBits16_inst1__I0[15:0] = self_register_O[15:0];

  assign Mux2xOutBits16_inst1__I1[15:0] = self_register_O[15:0];

  assign Mux2xOutBits16_inst3__I0[15:0] = Mux2xOutBits16_inst1__O[15:0];

  assign Mux2xOutBits16_inst1__S = coreir_eq_2_inst2__out;

  // inst2: picks between `value` (via inst0) and config_data on (config_we == 1).
  assign Mux2xOutBits16_inst2__I1[15:0] = config_data[15:0];

  assign Mux2xOutBits16_inst4__I0[15:0] = Mux2xOutBits16_inst2__O[15:0];

  assign Mux2xOutBits16_inst4__I1[15:0] = Mux2xOutBits16_inst2__O[15:0];

  assign Mux2xOutBits16_inst7__I1[15:0] = Mux2xOutBits16_inst2__O[15:0];

  assign Mux2xOutBits16_inst2__S = corebit_eq_inst0__O;

  // inst3: both inputs resolve to self_register_O (I0 via inst1, I1 directly).
  assign Mux2xOutBits16_inst3__I1[15:0] = self_register_O[15:0];

  assign Mux2xOutBits16_inst5__I0[15:0] = Mux2xOutBits16_inst3__O[15:0];

  assign Mux2xOutBits16_inst6__I0[15:0] = Mux2xOutBits16_inst3__O[15:0];

  assign Mux2xOutBits16_inst6__I1[15:0] = Mux2xOutBits16_inst3__O[15:0];

  assign Mux2xOutBits16_inst9__I1[15:0] = Mux2xOutBits16_inst3__O[15:0];

  assign Mux2xOutBits16_inst3__S = corebit_eq_inst2__O;

  assign Mux2xOutBits16_inst7__I0[15:0] = Mux2xOutBits16_inst4__O[15:0];

  assign Mux2xOutBits16_inst4__S = coreir_eq_2_inst3__out;

  // inst5: picks between the register value (via inst3) and `value` on (mode == 2'h2).
  assign Mux2xOutBits16_inst5__I1[15:0] = value[15:0];

  assign Mux2xOutBits16_inst8__I0[15:0] = Mux2xOutBits16_inst5__O[15:0];

  assign Mux2xOutBits16_inst5__S = coreir_eq_2_inst5__out;

  assign Mux2xOutBits16_inst9__I0[15:0] = Mux2xOutBits16_inst6__O[15:0];

  assign Mux2xOutBits16_inst6__S = coreir_eq_2_inst6__out;

  // O0: inst7's data inputs both resolve to inst2's output.
  assign O0[15:0] = Mux2xOutBits16_inst7__O[15:0];

  assign Mux2xOutBits16_inst7__S = coreir_eq_2_inst7__out;

  // inst8: picks between inst5's result and const_ on (mode == 2'h0); drives O2.
  assign Mux2xOutBits16_inst8__I1[15:0] = const_[15:0];

  assign O2[15:0] = Mux2xOutBits16_inst8__O[15:0];

  assign Mux2xOutBits16_inst8__S = coreir_eq_2_inst9__out;

  // O3: inst9's data inputs both resolve to inst3's output (self_register_O).
  assign O3[15:0] = Mux2xOutBits16_inst9__O[15:0];

  assign Mux2xOutBits16_inst9__S = coreir_eq_2_inst10__out;

  // ---- Wiring: comparator constant operands ----
  assign corebit_eq_inst0__I1 = bit_const_1_None__out;

  assign corebit_eq_inst1__I1 = bit_const_1_None__out;

  assign corebit_eq_inst2__I1 = bit_const_1_None__out;

  assign coreir_eq_2_inst10__in1[1:0] = const_0_2__out[1:0];

  assign coreir_eq_2_inst7__in1[1:0] = const_0_2__out[1:0];

  assign coreir_eq_2_inst8__in1[1:0] = const_0_2__out[1:0];

  assign coreir_eq_2_inst9__in1[1:0] = const_0_2__out[1:0];

  assign coreir_eq_2_inst3__in1[1:0] = const_2_2__out[1:0];

  assign coreir_eq_2_inst4__in1[1:0] = const_2_2__out[1:0];

  assign coreir_eq_2_inst5__in1[1:0] = const_2_2__out[1:0];

  assign coreir_eq_2_inst6__in1[1:0] = const_2_2__out[1:0];

  assign coreir_eq_2_inst0__in1[1:0] = const_3_2__out[1:0];

  assign coreir_eq_2_inst1__in1[1:0] = const_3_2__out[1:0];

  assign coreir_eq_2_inst2__in1[1:0] = const_3_2__out[1:0];

  // ---- Wiring: comparator variable operands (config_we and mode) ----
  assign corebit_eq_inst0__I0 = config_we;

  assign corebit_eq_inst1__I0 = config_we;

  assign corebit_eq_inst2__I0 = config_we;

  assign coreir_eq_2_inst0__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst1__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst10__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst2__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst3__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst4__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst5__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst6__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst7__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst8__in0[1:0] = mode[1:0];

  assign coreir_eq_2_inst9__in0[1:0] = mode[1:0];

endmodule  // RegisterMode_comb

// RegisterMode: wrapper that pairs one RegisterMode_comb instance with one
// 16-bit Register instance. The comb block computes the register's next value
// and enable plus both module outputs; the Register's output is fed back into
// the comb block as self_register_O.
// NOTE(review): the Register module definition is not shown here, so how it
// uses `en` (data-enable mux vs. a gated clock) cannot be confirmed from this
// listing.
module RegisterMode (
  input  ASYNCRESET,
  input  CLK,
  output [15:0] O0,
  output [15:0] O1,
  input  clk_en,
  input [15:0] config_data,
  input  config_we,
  input [15:0] const_,
  input [1:0] mode,
  input [15:0] value
);

  // Combinational block: every port gets a dedicated wire, connected below.
  wire [15:0] RegisterMode_comb_inst0__O0;
  wire  RegisterMode_comb_inst0__O1;
  wire [15:0] RegisterMode_comb_inst0__O2;
  wire [15:0] RegisterMode_comb_inst0__O3;
  wire  RegisterMode_comb_inst0__clk_en;
  wire [15:0] RegisterMode_comb_inst0__config_data;
  wire  RegisterMode_comb_inst0__config_we;
  wire [15:0] RegisterMode_comb_inst0__const_;
  wire [1:0] RegisterMode_comb_inst0__mode;
  wire [15:0] RegisterMode_comb_inst0__self_register_O;
  wire [15:0] RegisterMode_comb_inst0__value;
  RegisterMode_comb RegisterMode_comb_inst0(
    .O0(RegisterMode_comb_inst0__O0),
    .O1(RegisterMode_comb_inst0__O1),
    .O2(RegisterMode_comb_inst0__O2),
    .O3(RegisterMode_comb_inst0__O3),
    .clk_en(RegisterMode_comb_inst0__clk_en),
    .config_data(RegisterMode_comb_inst0__config_data),
    .config_we(RegisterMode_comb_inst0__config_we),
    .const_(RegisterMode_comb_inst0__const_),
    .mode(RegisterMode_comb_inst0__mode),
    .self_register_O(RegisterMode_comb_inst0__self_register_O),
    .value(RegisterMode_comb_inst0__value)
  );

  // The operand register itself.
  wire  Register_inst0__ASYNCRESET;
  wire  Register_inst0__CLK;
  wire [15:0] Register_inst0__O;
  wire  Register_inst0__en;
  wire [15:0] Register_inst0__value;
  Register Register_inst0(
    .ASYNCRESET(Register_inst0__ASYNCRESET),
    .CLK(Register_inst0__CLK),
    .O(Register_inst0__O),
    .en(Register_inst0__en),
    .value(Register_inst0__value)
  );

  // Register data input comes from the comb block's O0.
  assign Register_inst0__value[15:0] = RegisterMode_comb_inst0__O0[15:0];

  // Register enable comes from the comb block's O1 (the mux-derived enable the
  // issue is about), not directly from clk_en.
  assign Register_inst0__en = RegisterMode_comb_inst0__O1;

  // Module outputs are the comb block's O2 and O3.
  assign O0[15:0] = RegisterMode_comb_inst0__O2[15:0];

  assign O1[15:0] = RegisterMode_comb_inst0__O3[15:0];

  // Module inputs pass straight through to the comb block.
  assign RegisterMode_comb_inst0__clk_en = clk_en;

  assign RegisterMode_comb_inst0__config_data[15:0] = config_data[15:0];

  assign RegisterMode_comb_inst0__config_we = config_we;

  assign RegisterMode_comb_inst0__const_[15:0] = const_[15:0];

  assign RegisterMode_comb_inst0__mode[1:0] = mode[1:0];

  // Feedback: the current register contents go back into the comb block.
  assign RegisterMode_comb_inst0__self_register_O[15:0] = Register_inst0__O[15:0];

  assign RegisterMode_comb_inst0__value[15:0] = value[15:0];

  // Reset and clock go to the Register unmodified.
  assign Register_inst0__ASYNCRESET = ASYNCRESET;

  assign Register_inst0__CLK = CLK;

endmodule  // RegisterMode

Based on my understanding of the Verilog above, the clk_en is generated through a mux tree. I'm not certain whether the synthesis tool can infer the proper clock-gating logic from such a big mux tree.

EDIT: I read through the python code that you pointed out and it seems that lassen/peak has the correct logic.