SQRT operation for FP16ALT only works when all op[n] lanes are assigned to the same input

Hello,

I am trying to set up the FPU for FP16ALT operations. Here is my configuration:

  fpnew_top #(
    .Features       ( fpnew_pkg::VECTOR_CORE         ),
    .Implementation ( fpnew_pkg::VECTOR_CORE_IMPL    ),
    .TagType        ( logic [vcore_tag_width-1:0]    ),
    .PulpDivsqrt    ( 1'b1                           )
  )

  localparam fpu_features_t VECTOR_CORE = '{
    Width:         32,
    EnableVectors: 1'b1,
    EnableNanBox:  1'b1,
    FpFmtMask:     5'b10001,
    IntFmtMask:    4'b0000
  };

  localparam fpu_implementation_t VECTOR_CORE_IMPL = '{
    PipeRegs:   '{default: 1},
    UnitTypes:  '{'{default: PARALLEL}, // ADDMUL
                  '{default: MERGED}, // DIVSQRT
                  '{default: PARALLEL}, // NONCOMP
                  '{default: MERGED}},  // CONV
    PipeConfig: BEFORE
  };

And here is the operation I am performing:

    // Initialize inputs
    operands_i = '0;
    rnd_mode_i = fpnew_pkg::RNE;
    op_i = fpnew_pkg::ADD;
    op_mod_i = 1'b0;
    src_fmt_i =fpnew_pkg::FP16ALT;
    dst_fmt_i =fpnew_pkg::FP16ALT;
    int_fmt_i =fpnew_pkg::INT32;
    vectorial_op_i = 1'b1;
    tag_i = 'b0;
    simd_mask_i = 2'b11;
    in_valid_i = 1'b0;
    flush_i = 1'b0;
    out_ready_i = 1'b1;

    tag_i = 5;
    op_i = fpnew_pkg::SQRT;
    op_mod_i = 1'b0;
    operands_i[0] = {{{16'h4210} , {16'h40A0}}};
    operands_i[1] = {32{1'b1}};
    operands_i[2] = {32{1'b1}};
    cycle_input_valid(); //TEST CASE:         5, RESULT 1: 7fc0, RESULT 2: 7fc0 (NaN, NaN)

    tag_i = 6;
    op_i = fpnew_pkg::SQRT;
    op_mod_i = 1'b0;
    operands_i[0] = {{{16'h4210} , {16'h40A0}}}; // 36 and 5
    operands_i[1] = {{{16'h4210} , {16'h40A0}}}; // 36 and 5
    operands_i[2] = {{{16'h4210} , {16'h40A0}}}; // 36 and 5
    cycle_input_valid(); //TEST CASE:         6, RESULT 1: 40c0, RESULT 2: 400f (correct outputs)

The output is NaN if I apply the values to only one of the operand inputs `operands_i[n]` (first example; it does not matter which one) and set the others to all ones (`{32{1'b1}}`). Only when I apply the same values to all three operand inputs (second example) do I get the correct, non-NaN output. I can't find anything in README or docs/README that explains this. Is this expected behaviour?

Thank you

openhwgroup / cvfpu

SQRT operation for FP16ALT only works when all op[n] lanes are assigned to the same input #117