llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.96k stars 11.53k forks source link

Excessive stack usage with array bounds sanitizer when targeting PowerPC in Linux kernel's drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c #98367

Open nathanchance opened 2 months ago

nathanchance commented 2 months ago

A recent change in Fedora's configuration to build certain files in the drivers/gpu/drm folder of the Linux kernel with -Werror revealed an instance of -Wframe-larger-than in drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c that appears only when building for PowerPC. It shows up with -fsanitize=array-bounds, and the reason becomes apparent from the reproducer that I got from cvise:

// cvise-reduced reproducer for excessive stack usage when building the
// kernel's vcn_v5_0_0.c for PowerPC with -fsanitize=array-bounds.
// Everything below is machine-reduced from the amdgpu driver: names,
// types, and odd constructs (e.g. the `enum { true } typedef bool;`
// stand-in for <stdbool.h>) are reduction artifacts, not design.
// NOTE(review): keep this byte-for-byte — any edit may change the
// generated code enough to hide the frame-size regression.
enum { true } typedef bool;
struct amdgpu_gfx_config {
  unsigned gb_addr_config;
};
struct common_firmware_header {
  int ucode_size_bytes;
};
enum { AMDGPU_UCODE_ID_VCN, AMDGPU_UCODE_ID_MAXIMUM };
enum amdgpu_firmware_load_type { AMDGPU_FW_LOAD_PSP };
struct amdgpu_firmware_info {
  int tmr_mc_addr_lo;
  int tmr_mc_addr_hi;
};
struct amdgpu_firmware {
  struct amdgpu_firmware_info ucode[AMDGPU_UCODE_ID_MAXIMUM];
  enum amdgpu_firmware_load_type load_type;
};
struct amdgpu_vcn_fw_shared {
  void *cpu_addr;
};
struct amdgpu_vcn_inst {
  long gpu_addr;
  int ring_enc[3];
  void *dpg_sram_cpu_addr;
  int *dpg_sram_curr_addr;
  struct amdgpu_vcn_fw_shared fw_shared;
};
struct amdgpu_vcn {
  int fw[4];
  struct amdgpu_vcn_inst inst[4];
  unsigned harvest_config;
};
struct amdgpu_fw_shared_unified_queue_struct {
  char queue_mode;
} *vcn_v5_0_0_start_fw_shared;
// Reduction fused a struct return type onto this stub; in the real
// driver amdgpu_sriov_wreg is a register-write helper.
struct amdgpu_virt {
  int caps;
} amdgpu_sriov_wreg(int, int);
enum amd_hw_ip_block_type { VCN_HWIP, MAX_HWIP };
// reg_offset[MAX_HWIP][4] is the array whose indexing the array-bounds
// sanitizer instruments throughout the functions below.
struct amdgpu_device {
  int pg_flags;
  struct amdgpu_gfx_config gfx;
  struct amdgpu_vcn vcn;
  struct amdgpu_firmware firmware;
  int *reg_offset[MAX_HWIP][4];
  struct amdgpu_virt virt;
} vcn_v5_0_0_mc_resume_dpg_mode_adev,
    *vcn_v5_0_0_disable_static_power_gating_adev,
    *vcn_v5_0_0_start_dpg_mode_adev;
// Former locals of the reduced functions, hoisted to globals by cvise.
int vcn_v5_0_0_mc_resume_offset, vcn_v5_0_0_mc_resume_inst,
    vcn_v5_0_0_mc_resume_dpg_mode_inst_idx,
    vcn_v5_0_0_mc_resume_dpg_mode_offset, vcn_v5_0_0_mc_resume_dpg_mode_size,
    vcn_v5_0_0_disable_static_power_gating_adev_6_0,
    vcn_v5_0_0_disable_static_power_gating_inst,
    vcn_v5_0_0_start_dpg_mode_inst_idx, vcn_v5_0_0_start_dpg_mode_ring,
    vcn_v5_0_0_start_dpg_mode_ring_3, vcn_v5_0_0_start_dpg_mode_tmp,
    vcn_v5_0_0_start_dpg_mode_internal_reg_offset,
    vcn_v5_0_0_start_dpg_mode_addr, vcn_v5_0_0_start_tmp, vcn_v5_0_0_start_i,
    vcn_v5_0_0_start_j, vcn_v5_0_0_start_r, vcn_v5_0_0_start_status,
    vcn_v5_0_0_start_ring;
char (*vcn_v5_0_0_start_dpg_mode_adev_5_0)(struct amdgpu_device *,
                                           enum amd_hw_ip_block_type, char);
char vcn_v5_0_0_start_dpg_mode___trans_tmp_15,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_14,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_13,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_12,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_11,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_10,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_9,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_8,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_7,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_6,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_5,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_4,
    vcn_v5_0_0_start_dpg_mode___trans_tmp_2;
bool vcn_v5_0_0_start_dpg_mode_aon_range;
long vcn_v5_0_0_start_ring_0;
// Stubs for the driver's register wait/read helpers.
void amdgpu_device_wait_on_rreg(int, int);
int amdgpu_device_rreg(struct amdgpu_device *, int);
// Reduced form of the driver's vcn_v5_0_0_start_dpg_mode(). The long
// runs of `cond ? amdgpu_sriov_wreg(...) : amdgpu_device_wait_on_rreg(...)`
// are the residue of the driver's WREG32/SRAM register macros after
// inlining, and the GNU statement expressions `({ ... })` come from
// those macros too. Each `reg_offset[VCN_HWIP][idx][1]` access is
// presumably what -fsanitize=array-bounds instruments — TODO confirm
// against the generated IR. Do not edit: the exact statement sequence
// is what reproduces the PowerPC stack growth.
static int vcn_v5_0_0_start_dpg_mode() {
  struct amdgpu_fw_shared_unified_queue_struct *fw_shared =
      vcn_v5_0_0_start_dpg_mode_adev->vcn
          .inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
          .fw_shared.cpu_addr;
  vcn_v5_0_0_start_dpg_mode_tmp =
      vcn_v5_0_0_start_dpg_mode_adev
          ? vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]
          : 0;
  amdgpu_device_wait_on_rreg(
      vcn_v5_0_0_start_dpg_mode_adev
          ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
      0);
  vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
      .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode_adev->vcn
                                .inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
                                .dpg_sram_cpu_addr;
  vcn_v5_0_0_start_dpg_mode___trans_tmp_2
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            vcn_v5_0_0_start_dpg_mode_tmp)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  vcn_v5_0_0_start_dpg_mode___trans_tmp_4
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            ({
              vcn_v5_0_0_start_dpg_mode_adev
                  ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
            }))
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            ({
              vcn_v5_0_0_start_dpg_mode_adev
                  ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
            }));
  *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
       .dpg_sram_curr_addr = ({
    vcn_v5_0_0_start_dpg_mode_adev
        ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
  });
  *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
       .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode___trans_tmp_5 =
      vcn_v5_0_0_start_dpg_mode___trans_tmp_6;
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  vcn_v5_0_0_start_dpg_mode___trans_tmp_7
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            1 | ({
                  vcn_v5_0_0_start_dpg_mode_adev
                      ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx]
                                  [1];
                }) << 4)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            ({
              vcn_v5_0_0_start_dpg_mode_adev
                  ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
            }));
  *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
       .dpg_sram_curr_addr = ({
    vcn_v5_0_0_start_dpg_mode_adev
        ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
  });
  *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
       .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode___trans_tmp_8 =
      vcn_v5_0_0_start_dpg_mode___trans_tmp_9;
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            vcn_v5_0_0_start_dpg_mode_tmp)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  vcn_v5_0_0_start_dpg_mode___trans_tmp_10
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            ({
              vcn_v5_0_0_start_dpg_mode_adev
                  ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
            }))
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            ({
              vcn_v5_0_0_start_dpg_mode_adev
                  ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
            }));
  *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
       .dpg_sram_curr_addr = ({
    vcn_v5_0_0_start_dpg_mode_adev
        ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
  });
  *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
       .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode_tmp;
  // Apparently an inlined remnant of vcn_v5_0_0_mc_resume_dpg_mode(),
  // judging by the variable names — note it reads the *other* global
  // adev object, not vcn_v5_0_0_start_dpg_mode_adev.
  struct common_firmware_header *hdr =
      (struct common_firmware_header *)&vcn_v5_0_0_mc_resume_dpg_mode_adev.vcn
          .fw[vcn_v5_0_0_mc_resume_dpg_mode_inst_idx];
  vcn_v5_0_0_mc_resume_dpg_mode_size = vcn_v5_0_0_mc_resume_dpg_mode_offset;
  vcn_v5_0_0_mc_resume_dpg_mode_adev.vcn
      .inst[vcn_v5_0_0_mc_resume_dpg_mode_inst_idx]
      .dpg_sram_curr_addr = 0;
  // NOTE(review): dpg_sram_curr_addr was just set to 0, so this store
  // dereferences a null pointer at runtime — harmless for a
  // compile-only reproducer, but this code must never be executed.
  *vcn_v5_0_0_mc_resume_dpg_mode_adev.vcn
       .inst[vcn_v5_0_0_mc_resume_dpg_mode_inst_idx]
       .dpg_sram_curr_addr = vcn_v5_0_0_mc_resume_dpg_mode_adev.vcn
                                 .inst[vcn_v5_0_0_mc_resume_dpg_mode_inst_idx]
                                 .gpu_addr +
                             hdr->ucode_size_bytes;
  vcn_v5_0_0_start_dpg_mode___trans_tmp_11 =
      vcn_v5_0_0_start_dpg_mode___trans_tmp_12 = 0;
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            ({
              vcn_v5_0_0_start_dpg_mode_adev
                  ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
            }))
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            ({
              vcn_v5_0_0_start_dpg_mode_adev
                  ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
            }));
  *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
       .dpg_sram_curr_addr = ({
    vcn_v5_0_0_start_dpg_mode_adev
        ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1] =
        vcn_v5_0_0_start_dpg_mode_aon_range;
  });
  *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
       .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode___trans_tmp_13 =
      vcn_v5_0_0_start_dpg_mode___trans_tmp_14;
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            2)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  vcn_v5_0_0_start_dpg_mode___trans_tmp_15 = vcn_v5_0_0_start_dpg_mode_adev_5_0(
      vcn_v5_0_0_start_dpg_mode_adev, VCN_HWIP,
      vcn_v5_0_0_start_dpg_mode_inst_idx);
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            ({
              vcn_v5_0_0_start_dpg_mode_adev
                  ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1];
            }))
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            ({
              vcn_v5_0_0_start_dpg_mode_addr =
                  vcn_v5_0_0_start_dpg_mode_adev
                      ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx]
                                  [1];
              if (vcn_v5_0_0_start_dpg_mode_addr)
                vcn_v5_0_0_start_dpg_mode_internal_reg_offset =
                    vcn_v5_0_0_start_dpg_mode_addr;
              vcn_v5_0_0_start_dpg_mode_addr;
            }));
  vcn_v5_0_0_start_dpg_mode_ring =
      vcn_v5_0_0_start_dpg_mode_adev->vcn
              .inst[vcn_v5_0_0_start_dpg_mode_inst_idx]
              .ring_enc[0]
          ? vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]
          : amdgpu_device_rreg(
                vcn_v5_0_0_start_dpg_mode_adev,
                vcn_v5_0_0_start_dpg_mode_adev
                    ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx]
                                [1]);
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            vcn_v5_0_0_start_dpg_mode_tmp)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  vcn_v5_0_0_start_dpg_mode_adev
      ? vcn_v5_0_0_start_dpg_mode_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]
      : amdgpu_device_rreg(
            vcn_v5_0_0_start_dpg_mode_adev,
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]);
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            vcn_v5_0_0_start_dpg_mode_tmp)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  vcn_v5_0_0_start_dpg_mode_adev
      ? vcn_v5_0_0_start_dpg_mode_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]
      : amdgpu_device_rreg(
            vcn_v5_0_0_start_dpg_mode_adev,
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]);
  vcn_v5_0_0_start_dpg_mode_adev
      ? vcn_v5_0_0_start_dpg_mode_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]
      : amdgpu_device_rreg(
            vcn_v5_0_0_start_dpg_mode_adev,
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]);
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            vcn_v5_0_0_start_dpg_mode_tmp)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  fw_shared->queue_mode &= 0;
  vcn_v5_0_0_start_dpg_mode_adev
      ? amdgpu_sriov_wreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            vcn_v5_0_0_start_dpg_mode_ring_3)
      : amdgpu_device_wait_on_rreg(
            vcn_v5_0_0_start_dpg_mode_adev
                ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1],
            0);
  return 0;
}
// Reduced vcn_v5_0_0_start() — the function the -Wframe-larger-than
// diagnostic fires on (2064 bytes on powerpc64 vs. 72-208 elsewhere,
// per the transcripts in this report). The infinite `for (;;)` loop,
// the duplicated wreg/rreg ternaries, and the unused expression
// statements are cvise artifacts; the interesting part for the bug is
// the volume of sanitized reg_offset[...] index expressions, which
// presumably each pin a stack slot on PowerPC — TODO confirm in the
// backend. Keep byte-for-byte to preserve the reproducer.
void vcn_v5_0_0_start(struct amdgpu_device *adev) {
  for (vcn_v5_0_0_start_i = 0;;) {
    vcn_v5_0_0_start_fw_shared =
        adev->vcn.inst[vcn_v5_0_0_start_i].fw_shared.cpu_addr;
    if (adev->pg_flags) {
      vcn_v5_0_0_start_dpg_mode();
      continue;
    }
    // Inlined remnant of vcn_v5_0_0_disable_static_power_gating(),
    // judging by the names; operates on its own global adev pointer.
    amdgpu_device_wait_on_rreg(
        vcn_v5_0_0_disable_static_power_gating_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst]
                        [1],
        0);
    vcn_v5_0_0_disable_static_power_gating_adev
        ? amdgpu_sriov_wreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0)
        : amdgpu_device_wait_on_rreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0);
    amdgpu_device_wait_on_rreg(
        vcn_v5_0_0_disable_static_power_gating_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst]
                        [1],
        4);
    vcn_v5_0_0_disable_static_power_gating_adev_6_0
        ? amdgpu_sriov_wreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0)
        : amdgpu_device_wait_on_rreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0);
    amdgpu_device_wait_on_rreg(
        vcn_v5_0_0_disable_static_power_gating_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst]
                        [1],
        5);
    vcn_v5_0_0_disable_static_power_gating_adev_6_0
        ? amdgpu_sriov_wreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0)
        : amdgpu_device_wait_on_rreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0);
    vcn_v5_0_0_disable_static_power_gating_adev_6_0
        ? amdgpu_sriov_wreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0)
        : amdgpu_device_wait_on_rreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0);
    amdgpu_device_wait_on_rreg(
        vcn_v5_0_0_disable_static_power_gating_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst]
                        [1],
        0);
    vcn_v5_0_0_disable_static_power_gating_adev_6_0
        ? amdgpu_sriov_wreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0)
        : amdgpu_device_wait_on_rreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0);
    amdgpu_device_wait_on_rreg(
        vcn_v5_0_0_disable_static_power_gating_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst]
                        [1],
        0);
    vcn_v5_0_0_disable_static_power_gating_adev_6_0
        ? amdgpu_sriov_wreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0)
        : amdgpu_device_wait_on_rreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0);
    amdgpu_device_wait_on_rreg(
        vcn_v5_0_0_disable_static_power_gating_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst]
                        [1],
        0);
    vcn_v5_0_0_disable_static_power_gating_adev_6_0
        ? amdgpu_sriov_wreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0)
        : amdgpu_device_wait_on_rreg(
              vcn_v5_0_0_disable_static_power_gating_adev
                  ->reg_offset[VCN_HWIP]
                              [vcn_v5_0_0_disable_static_power_gating_inst][1],
              0);
    amdgpu_device_wait_on_rreg(
        vcn_v5_0_0_disable_static_power_gating_adev
            ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst]
                        [1],
        0);
    adev->virt.caps ? adev->reg_offset[vcn_v5_0_0_start_i]
    : adev->reg_offset[vcn_v5_0_0_start_i][1]
        ? amdgpu_sriov_wreg(adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
                            vcn_v5_0_0_start_tmp)
        : amdgpu_device_wait_on_rreg(
              adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0);
    int tmp_ = adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1];
    amdgpu_device_wait_on_rreg(
        adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
        adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]);
    amdgpu_device_wait_on_rreg(
        adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
        adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]);
    tmp_ = adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1];
    amdgpu_device_wait_on_rreg(
        adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], tmp_);
    // The next line uses a GNU "?:" with omitted middle operand; the
    // unformatted spacing is as emitted by the reducer.
    adev->virt.caps         ? adev->reg_offset[vcn_v5_0_0_start_i]         :                               adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i]? : amdgpu_device_wait_on_rreg(                           adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0);
    adev->virt.caps ? adev->reg_offset[vcn_v5_0_0_start_i]
    : adev->reg_offset[vcn_v5_0_0_start_i][1]
        ? amdgpu_sriov_wreg(adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
                            0)
        : amdgpu_device_wait_on_rreg(
              adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0);
    struct amdgpu_device *__trans_tmp_1 = adev;
    struct common_firmware_header *hdr =
        (struct common_firmware_header *)&adev->vcn
            .fw[vcn_v5_0_0_mc_resume_inst];
    __trans_tmp_1->virt.caps
        ? __trans_tmp_1->firmware.ucode[vcn_v5_0_0_mc_resume_inst]
        : amdgpu_device_wait_on_rreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              __trans_tmp_1->firmware.ucode[vcn_v5_0_0_mc_resume_inst]
                  .tmr_mc_addr_lo);
    __trans_tmp_1->virt.caps
        ? __trans_tmp_1->firmware.ucode[vcn_v5_0_0_mc_resume_inst]
              .tmr_mc_addr_hi
        : amdgpu_device_wait_on_rreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              __trans_tmp_1->firmware.ucode[vcn_v5_0_0_mc_resume_inst]
                  .tmr_mc_addr_hi);
    amdgpu_device_wait_on_rreg(
        __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], 0);
    __trans_tmp_1->virt.caps
        ? amdgpu_sriov_wreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr)
        : amdgpu_device_wait_on_rreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr);
    __trans_tmp_1->virt.caps
        ? amdgpu_sriov_wreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr)
        : amdgpu_device_wait_on_rreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr);
    vcn_v5_0_0_mc_resume_offset = hdr->ucode_size_bytes;
    __trans_tmp_1->virt.caps
        ? amdgpu_sriov_wreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              3)
        : amdgpu_device_wait_on_rreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              0);
    __trans_tmp_1->virt.caps
        ? amdgpu_sriov_wreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr)
        : amdgpu_device_wait_on_rreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1],
              __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr);
    __trans_tmp_1->virt.caps
        ? amdgpu_sriov_wreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
              __trans_tmp_1->gfx.gb_addr_config)
        : amdgpu_device_wait_on_rreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0);
    tmp_ = __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1];
    amdgpu_device_wait_on_rreg(
        __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
        __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]);
    amdgpu_device_wait_on_rreg(
        __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], tmp_);
    // NOTE(review): vcn_v5_0_0_start_j is never incremented, so if
    // reached this loop never terminates — irrelevant for a
    // compile-only reproducer.
    for (vcn_v5_0_0_start_j = 0; vcn_v5_0_0_start_j < 10;) {
      vcn_v5_0_0_start_status = __trans_tmp_1 && __trans_tmp_1;
      amdgpu_device_wait_on_rreg(
          __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
          __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]);
      amdgpu_device_wait_on_rreg(
          __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], tmp_);
      vcn_v5_0_0_start_r = 1;
    }
    tmp_ = __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1];
    amdgpu_device_wait_on_rreg(
        __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
        __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]);
    amdgpu_device_wait_on_rreg(
        __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], tmp_);
    vcn_v5_0_0_start_ring =
        __trans_tmp_1->vcn.inst[vcn_v5_0_0_start_i].ring_enc[0];
    __trans_tmp_1->virt.caps ? __trans_tmp_1->reg_offset[vcn_v5_0_0_start_i]
    : __trans_tmp_1->reg_offset[vcn_v5_0_0_start_i][1]
        ? amdgpu_sriov_wreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
              vcn_v5_0_0_start_tmp)
        : amdgpu_device_wait_on_rreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0);
    vcn_v5_0_0_start_ring_0 ? __trans_tmp_1->reg_offset[vcn_v5_0_0_start_i]
    : __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i]
        ? __trans_tmp_1->reg_offset[vcn_v5_0_0_start_i]
    : __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i]
        ? amdgpu_sriov_wreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1],
              vcn_v5_0_0_start_tmp)
        : amdgpu_device_wait_on_rreg(
              __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0);
  }
}

Based on this reproducer, PowerPC performs much worse than several other architectures:

$ clang --target=aarch64-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i
vcn_v5_0_0.i:355:6: warning: stack frame size (128) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than]
  355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) {
      |      ^
1 warning generated.

$ clang --target=loongarch64-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i
vcn_v5_0_0.i:355:6: warning: stack frame size (208) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than]
  355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) {
      |      ^
1 warning generated.

$ clang --target=powerpc64-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i
vcn_v5_0_0.i:355:6: warning: stack frame size (2064) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than]
  355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) {
      |      ^
1 warning generated.

$ clang --target=s390x-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i
vcn_v5_0_0.i:355:6: warning: stack frame size (192) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than]
  355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) {
      |      ^
1 warning generated.

$ clang --target=x86_64-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i
vcn_v5_0_0.i:355:6: warning: stack frame size (72) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than]
  355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) {
      |      ^
1 warning generated.

I've tentatively marked this as an issue with the PowerPC backend.

llvmbot commented 2 months ago

@llvm/issue-subscribers-backend-powerpc

Author: Nathan Chancellor (nathanchance)

A recent change in Fedora's configuration to build certain files in the `drivers/gpu/drm` folder of the Linux kernel with `-Werror` revealed an instance of `-Wframe-larger-than` in [`drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c`](https://elixir.bootlin.com/linux/v6.10-rc7/source/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c) that appears only when building for PowerPC. It shows up with `-fsanitize=array-bounds`, which becomes apparent why from the reproducer that I got from `cvise`: ```c enum { true } typedef bool; struct amdgpu_gfx_config { unsigned gb_addr_config; }; struct common_firmware_header { int ucode_size_bytes; }; enum { AMDGPU_UCODE_ID_VCN, AMDGPU_UCODE_ID_MAXIMUM }; enum amdgpu_firmware_load_type { AMDGPU_FW_LOAD_PSP }; struct amdgpu_firmware_info { int tmr_mc_addr_lo; int tmr_mc_addr_hi; }; struct amdgpu_firmware { struct amdgpu_firmware_info ucode[AMDGPU_UCODE_ID_MAXIMUM]; enum amdgpu_firmware_load_type load_type; }; struct amdgpu_vcn_fw_shared { void *cpu_addr; }; struct amdgpu_vcn_inst { long gpu_addr; int ring_enc[3]; void *dpg_sram_cpu_addr; int *dpg_sram_curr_addr; struct amdgpu_vcn_fw_shared fw_shared; }; struct amdgpu_vcn { int fw[4]; struct amdgpu_vcn_inst inst[4]; unsigned harvest_config; }; struct amdgpu_fw_shared_unified_queue_struct { char queue_mode; } *vcn_v5_0_0_start_fw_shared; struct amdgpu_virt { int caps; } amdgpu_sriov_wreg(int, int); enum amd_hw_ip_block_type { VCN_HWIP, MAX_HWIP }; struct amdgpu_device { int pg_flags; struct amdgpu_gfx_config gfx; struct amdgpu_vcn vcn; struct amdgpu_firmware firmware; int *reg_offset[MAX_HWIP][4]; struct amdgpu_virt virt; } vcn_v5_0_0_mc_resume_dpg_mode_adev, *vcn_v5_0_0_disable_static_power_gating_adev, *vcn_v5_0_0_start_dpg_mode_adev; int vcn_v5_0_0_mc_resume_offset, vcn_v5_0_0_mc_resume_inst, vcn_v5_0_0_mc_resume_dpg_mode_inst_idx, vcn_v5_0_0_mc_resume_dpg_mode_offset, vcn_v5_0_0_mc_resume_dpg_mode_size, vcn_v5_0_0_disable_static_power_gating_adev_6_0, vcn_v5_0_0_disable_static_power_gating_inst, 
vcn_v5_0_0_start_dpg_mode_inst_idx, vcn_v5_0_0_start_dpg_mode_ring, vcn_v5_0_0_start_dpg_mode_ring_3, vcn_v5_0_0_start_dpg_mode_tmp, vcn_v5_0_0_start_dpg_mode_internal_reg_offset, vcn_v5_0_0_start_dpg_mode_addr, vcn_v5_0_0_start_tmp, vcn_v5_0_0_start_i, vcn_v5_0_0_start_j, vcn_v5_0_0_start_r, vcn_v5_0_0_start_status, vcn_v5_0_0_start_ring; char (*vcn_v5_0_0_start_dpg_mode_adev_5_0)(struct amdgpu_device *, enum amd_hw_ip_block_type, char); char vcn_v5_0_0_start_dpg_mode___trans_tmp_15, vcn_v5_0_0_start_dpg_mode___trans_tmp_14, vcn_v5_0_0_start_dpg_mode___trans_tmp_13, vcn_v5_0_0_start_dpg_mode___trans_tmp_12, vcn_v5_0_0_start_dpg_mode___trans_tmp_11, vcn_v5_0_0_start_dpg_mode___trans_tmp_10, vcn_v5_0_0_start_dpg_mode___trans_tmp_9, vcn_v5_0_0_start_dpg_mode___trans_tmp_8, vcn_v5_0_0_start_dpg_mode___trans_tmp_7, vcn_v5_0_0_start_dpg_mode___trans_tmp_6, vcn_v5_0_0_start_dpg_mode___trans_tmp_5, vcn_v5_0_0_start_dpg_mode___trans_tmp_4, vcn_v5_0_0_start_dpg_mode___trans_tmp_2; bool vcn_v5_0_0_start_dpg_mode_aon_range; long vcn_v5_0_0_start_ring_0; void amdgpu_device_wait_on_rreg(int, int); int amdgpu_device_rreg(struct amdgpu_device *, int); static int vcn_v5_0_0_start_dpg_mode() { struct amdgpu_fw_shared_unified_queue_struct *fw_shared = vcn_v5_0_0_start_dpg_mode_adev->vcn .inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .fw_shared.cpu_addr; vcn_v5_0_0_start_dpg_mode_tmp = vcn_v5_0_0_start_dpg_mode_adev ? vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1] : 0; amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode_adev->vcn .inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_cpu_addr; vcn_v5_0_0_start_dpg_mode___trans_tmp_2 ? 
amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], vcn_v5_0_0_start_dpg_mode_tmp) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); vcn_v5_0_0_start_dpg_mode___trans_tmp_4 ? amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; })) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; })); *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_curr_addr = ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; }); *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode___trans_tmp_5 = vcn_v5_0_0_start_dpg_mode___trans_tmp_6; vcn_v5_0_0_start_dpg_mode_adev ? amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); vcn_v5_0_0_start_dpg_mode___trans_tmp_7 ? 
amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 1 | ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx] [1]; }) << 4) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; })); *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_curr_addr = ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; }); *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode___trans_tmp_8 = vcn_v5_0_0_start_dpg_mode___trans_tmp_9; vcn_v5_0_0_start_dpg_mode_adev ? amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], vcn_v5_0_0_start_dpg_mode_tmp) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); vcn_v5_0_0_start_dpg_mode___trans_tmp_10 ? 
amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; })) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; })); *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_curr_addr = ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; }); *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode_tmp; struct common_firmware_header *hdr = (struct common_firmware_header *)&vcn_v5_0_0_mc_resume_dpg_mode_adev.vcn .fw[vcn_v5_0_0_mc_resume_dpg_mode_inst_idx]; vcn_v5_0_0_mc_resume_dpg_mode_size = vcn_v5_0_0_mc_resume_dpg_mode_offset; vcn_v5_0_0_mc_resume_dpg_mode_adev.vcn .inst[vcn_v5_0_0_mc_resume_dpg_mode_inst_idx] .dpg_sram_curr_addr = 0; *vcn_v5_0_0_mc_resume_dpg_mode_adev.vcn .inst[vcn_v5_0_0_mc_resume_dpg_mode_inst_idx] .dpg_sram_curr_addr = vcn_v5_0_0_mc_resume_dpg_mode_adev.vcn .inst[vcn_v5_0_0_mc_resume_dpg_mode_inst_idx] .gpu_addr + hdr->ucode_size_bytes; vcn_v5_0_0_start_dpg_mode___trans_tmp_11 = vcn_v5_0_0_start_dpg_mode___trans_tmp_12 = 0; vcn_v5_0_0_start_dpg_mode_adev ? 
amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; })) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; })); *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_curr_addr = ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1] = vcn_v5_0_0_start_dpg_mode_aon_range; }); *vcn_v5_0_0_start_dpg_mode_adev->vcn.inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .dpg_sram_curr_addr = vcn_v5_0_0_start_dpg_mode___trans_tmp_13 = vcn_v5_0_0_start_dpg_mode___trans_tmp_14; vcn_v5_0_0_start_dpg_mode_adev ? amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 2) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); vcn_v5_0_0_start_dpg_mode___trans_tmp_15 = vcn_v5_0_0_start_dpg_mode_adev_5_0( vcn_v5_0_0_start_dpg_mode_adev, VCN_HWIP, vcn_v5_0_0_start_dpg_mode_inst_idx); vcn_v5_0_0_start_dpg_mode_adev ? 
amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], ({ vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]; })) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], ({ vcn_v5_0_0_start_dpg_mode_addr = vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx] [1]; if (vcn_v5_0_0_start_dpg_mode_addr) vcn_v5_0_0_start_dpg_mode_internal_reg_offset = vcn_v5_0_0_start_dpg_mode_addr; vcn_v5_0_0_start_dpg_mode_addr; })); vcn_v5_0_0_start_dpg_mode_ring = vcn_v5_0_0_start_dpg_mode_adev->vcn .inst[vcn_v5_0_0_start_dpg_mode_inst_idx] .ring_enc[0] ? vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1] : amdgpu_device_rreg( vcn_v5_0_0_start_dpg_mode_adev, vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx] [1]); vcn_v5_0_0_start_dpg_mode_adev ? amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], vcn_v5_0_0_start_dpg_mode_tmp) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); vcn_v5_0_0_start_dpg_mode_adev ? amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); vcn_v5_0_0_start_dpg_mode_adev ? amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); vcn_v5_0_0_start_dpg_mode_adev ? 
vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1] : amdgpu_device_rreg( vcn_v5_0_0_start_dpg_mode_adev, vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]); vcn_v5_0_0_start_dpg_mode_adev ? amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], vcn_v5_0_0_start_dpg_mode_tmp) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); vcn_v5_0_0_start_dpg_mode_adev ? vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1] : amdgpu_device_rreg( vcn_v5_0_0_start_dpg_mode_adev, vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]); vcn_v5_0_0_start_dpg_mode_adev ? vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1] : amdgpu_device_rreg( vcn_v5_0_0_start_dpg_mode_adev, vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1]); vcn_v5_0_0_start_dpg_mode_adev ? amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], vcn_v5_0_0_start_dpg_mode_tmp) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); fw_shared->queue_mode &= 0; vcn_v5_0_0_start_dpg_mode_adev ? 
amdgpu_sriov_wreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], vcn_v5_0_0_start_dpg_mode_ring_3) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_start_dpg_mode_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_start_dpg_mode_inst_idx][1], 0); return 0; } void vcn_v5_0_0_start(struct amdgpu_device *adev) { for (vcn_v5_0_0_start_i = 0;;) { vcn_v5_0_0_start_fw_shared = adev->vcn.inst[vcn_v5_0_0_start_i].fw_shared.cpu_addr; if (adev->pg_flags) { vcn_v5_0_0_start_dpg_mode(); continue; } amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst] [1], 0); vcn_v5_0_0_disable_static_power_gating_adev ? amdgpu_sriov_wreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0); amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst] [1], 4); vcn_v5_0_0_disable_static_power_gating_adev_6_0 ? amdgpu_sriov_wreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0); amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst] [1], 5); vcn_v5_0_0_disable_static_power_gating_adev_6_0 ? 
amdgpu_sriov_wreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0); vcn_v5_0_0_disable_static_power_gating_adev_6_0 ? amdgpu_sriov_wreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0); amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst] [1], 0); vcn_v5_0_0_disable_static_power_gating_adev_6_0 ? amdgpu_sriov_wreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0); amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst] [1], 0); vcn_v5_0_0_disable_static_power_gating_adev_6_0 ? amdgpu_sriov_wreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0); amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst] [1], 0); vcn_v5_0_0_disable_static_power_gating_adev_6_0 ? 
amdgpu_sriov_wreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0) : amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP] [vcn_v5_0_0_disable_static_power_gating_inst][1], 0); amdgpu_device_wait_on_rreg( vcn_v5_0_0_disable_static_power_gating_adev ->reg_offset[VCN_HWIP][vcn_v5_0_0_disable_static_power_gating_inst] [1], 0); adev->virt.caps ? adev->reg_offset[vcn_v5_0_0_start_i] : adev->reg_offset[vcn_v5_0_0_start_i][1] ? amdgpu_sriov_wreg(adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], vcn_v5_0_0_start_tmp) : amdgpu_device_wait_on_rreg( adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0); int tmp_ = adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]; amdgpu_device_wait_on_rreg( adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]); amdgpu_device_wait_on_rreg( adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]); tmp_ = adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]; amdgpu_device_wait_on_rreg( adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], tmp_); adev->virt.caps ? adev->reg_offset[vcn_v5_0_0_start_i] : adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i]? : amdgpu_device_wait_on_rreg( adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0); adev->virt.caps ? adev->reg_offset[vcn_v5_0_0_start_i] : adev->reg_offset[vcn_v5_0_0_start_i][1] ? amdgpu_sriov_wreg(adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0) : amdgpu_device_wait_on_rreg( adev->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0); struct amdgpu_device *__trans_tmp_1 = adev; struct common_firmware_header *hdr = (struct common_firmware_header *)&adev->vcn .fw[vcn_v5_0_0_mc_resume_inst]; __trans_tmp_1->virt.caps ? 
__trans_tmp_1->firmware.ucode[vcn_v5_0_0_mc_resume_inst] : amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], __trans_tmp_1->firmware.ucode[vcn_v5_0_0_mc_resume_inst] .tmr_mc_addr_lo); __trans_tmp_1->virt.caps ? __trans_tmp_1->firmware.ucode[vcn_v5_0_0_mc_resume_inst] .tmr_mc_addr_hi : amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], __trans_tmp_1->firmware.ucode[vcn_v5_0_0_mc_resume_inst] .tmr_mc_addr_hi); amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], 0); __trans_tmp_1->virt.caps ? amdgpu_sriov_wreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr) : amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr); __trans_tmp_1->virt.caps ? amdgpu_sriov_wreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr) : amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr); vcn_v5_0_0_mc_resume_offset = hdr->ucode_size_bytes; __trans_tmp_1->virt.caps ? amdgpu_sriov_wreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], 3) : amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], 0); __trans_tmp_1->virt.caps ? amdgpu_sriov_wreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr) : amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_mc_resume_inst][1], __trans_tmp_1->vcn.inst[vcn_v5_0_0_mc_resume_inst].gpu_addr); __trans_tmp_1->virt.caps ? 
amdgpu_sriov_wreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], __trans_tmp_1->gfx.gb_addr_config) : amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0); tmp_ = __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]; amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]); amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], tmp_); for (vcn_v5_0_0_start_j = 0; vcn_v5_0_0_start_j < 10;) { vcn_v5_0_0_start_status = __trans_tmp_1 && __trans_tmp_1; amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]); amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], tmp_); vcn_v5_0_0_start_r = 1; } tmp_ = __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]; amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1]); amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], tmp_); vcn_v5_0_0_start_ring = __trans_tmp_1->vcn.inst[vcn_v5_0_0_start_i].ring_enc[0]; __trans_tmp_1->virt.caps ? __trans_tmp_1->reg_offset[vcn_v5_0_0_start_i] : __trans_tmp_1->reg_offset[vcn_v5_0_0_start_i][1] ? amdgpu_sriov_wreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], vcn_v5_0_0_start_tmp) : amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0); vcn_v5_0_0_start_ring_0 ? __trans_tmp_1->reg_offset[vcn_v5_0_0_start_i] : __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i] ? __trans_tmp_1->reg_offset[vcn_v5_0_0_start_i] : __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i] ? 
amdgpu_sriov_wreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], vcn_v5_0_0_start_tmp) : amdgpu_device_wait_on_rreg( __trans_tmp_1->reg_offset[VCN_HWIP][vcn_v5_0_0_start_i][1], 0); } } ``` Based on this reproducer, PowerPC performs much worse compared to several other architectures: ``` $ clang --target=aarch64-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i vcn_v5_0_0.i:355:6: warning: stack frame size (128) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than] 355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) { | ^ 1 warning generated. $ clang --target=loongarch64-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i vcn_v5_0_0.i:355:6: warning: stack frame size (208) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than] 355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) { | ^ 1 warning generated. $ clang --target=powerpc64-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i vcn_v5_0_0.i:355:6: warning: stack frame size (2064) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than] 355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) { | ^ 1 warning generated. $ clang --target=s390x-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i vcn_v5_0_0.i:355:6: warning: stack frame size (192) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than] 355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) { | ^ 1 warning generated. $ clang --target=x86_64-linux-gnu -O2 -fsanitize=array-bounds -Wframe-larger-than=1 -c -o /dev/null vcn_v5_0_0.i vcn_v5_0_0.i:355:6: warning: stack frame size (72) exceeds limit (1) in 'vcn_v5_0_0_start' [-Wframe-larger-than] 355 | void vcn_v5_0_0_start(struct amdgpu_device *adev) { | ^ 1 warning generated. ``` I've tentatively marked this as an issue with the PowerPC backend.
nathanchance commented 2 months ago

This was also reported at https://bugzilla.redhat.com/show_bug.cgi?id=2296499, cc @tstellar @nikic

nathanchance commented 2 months ago

Perhaps this is similar to https://github.com/ClangBuiltLinux/linux/issues/2014 and its solution?

chenzheng1030 commented 2 months ago

Let me clarify the issue first. From https://godbolt.org/z/7Ed5nbrbb, clang trunk is able to compile with -Wframe-larger-than=2048 for ppc64le-unknown-linux-gnu. Do you mean that clang for ppc64le uses more stack memory than other platforms do? Trunk clang uses 1728 bytes.

chenzheng1030 commented 2 months ago

Register allocation uses too many stack slots for spills/reloads.

  194 regalloc                     - Number of spill slots allocated
chenzheng1030 commented 2 months ago

Too many global variable accesses (generated by -fsanitize=array-bounds) are hoisted out of the loop because global variable address materializations are all marked as rematerializable on PPC. The rematerializable instructions are hoisted without considering register pressure, so the live ranges for these global variables become quite large and lead to poor register allocation results.

  630 machinelicm                  - Number of machine instructions hoisted out of loop
chenzheng1030 commented 2 months ago
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index d2195cfbdc5c..ed4a2b943383 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1089,9 +1089,9 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(
   case PPC::LIS:
   case PPC::LIS8:
   case PPC::ADDIStocHA:
-  case PPC::ADDIStocHA8:
+  //case PPC::ADDIStocHA8:
   case PPC::ADDItocL:
-  case PPC::ADDItocL8:
+  //case PPC::ADDItocL8:
   case PPC::LOAD_STACK_GUARD:
   case PPC::PPCLdFixedAddr:
   case PPC::XXLXORz:

This can make the stack usage reduce to 208

chenzheng1030 commented 2 months ago
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index d2195cfbdc5c..ed4a2b943383 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1089,9 +1089,9 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(
   case PPC::LIS:
   case PPC::LIS8:
   case PPC::ADDIStocHA:
-  case PPC::ADDIStocHA8:
+  //case PPC::ADDIStocHA8:
   case PPC::ADDItocL:
-  case PPC::ADDItocL8:
+  //case PPC::ADDItocL8:
   case PPC::LOAD_STACK_GUARD:
   case PPC::PPCLdFixedAddr:
   case PPC::XXLXORz:

This can make the stack usage reduce to 208

This is evidence of the root cause. Marking ADDIStocHA8 and ADDItocL8 as rematerializable is correct. We need to find out why RA does not pull down the rematerializable instructions before it allocates so many spill slots.

chenzheng1030 commented 2 months ago

Hi @nathanchance Thanks for reporting this issue. We may need some time to fix it since we need to check the register allocation. If you are blocked by this issue, you can try the option -mllvm -disable-machine-licm. With this option, the stack size is comparable with other platforms. See https://godbolt.org/z/8TeP84E5j

chenzheng1030 commented 1 month ago

Hi @qcolombet do you have any insights about why Greedy Register Allocator does not pull down below reMaterializable instructions like MachineLICM assumes? Because Greedy Register Allocator does not pull down these instructions as expected, so many registers spills/reloads happen. Thank you!

  %1388:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @155
  %1389:g8rc = ADDItocL8 %1388:g8rc_and_g8rc_nox0, @155
  %1404:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @157
  %1405:g8rc = ADDItocL8 %1404:g8rc_and_g8rc_nox0, @157
  %1427:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @159
  %1428:g8rc = ADDItocL8 %1427:g8rc_and_g8rc_nox0, @159
  %1416:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @158
  %1417:g8rc = ADDItocL8 %1416:g8rc_and_g8rc_nox0, @158
  %1400:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @156
  %1401:g8rc = ADDItocL8 %1400:g8rc_and_g8rc_nox0, @156
  %1388:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @155
  %1389:g8rc = ADDItocL8 %1388:g8rc_and_g8rc_nox0, @155

These are the common medium code model pattern to get the address of the global variable @155 on PPC. ADDIStocHA8 and ADDItocL8 are both rematerializable instructions.

nemanjai commented 1 month ago

@chenzheng1030 This may be a limitation due to the two-instruction sequence to access the address. Perhaps it is worthwhile experimenting with providing a single-instruction pseudo to accomplish this which would be rematerializable.

chenzheng1030 commented 1 month ago

@chenzheng1030 This may be a limitation due to the two-instruction sequence to access the address. Perhaps it is worthwhile experimenting with providing a single-instruction pseudo to accomplish this which would be rematerializable.

Thanks. We hit a similar issue with big immediate loads before. In that case, some instructions produced by rematerializing the big immediate load did not have the isReMaterializable flag, and the solution was to use a single rematerializable pseudo instruction. I will check this issue too. BTW, -mcmodel=small also makes the RA result worse, but LDtoc from the small code model is not a rematerializable instruction.

qcolombet commented 1 month ago

Hi @qcolombet do you have any insights about why Greedy Register Allocator does not pull down below reMaterializable instructions like MachineLICM assumes? Because Greedy Register Allocator does not pull down these instructions as expected, so many registers spills/reloads happen. Thank you!

  %1388:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @155
  %1389:g8rc = ADDItocL8 %1388:g8rc_and_g8rc_nox0, @155
  %1404:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @157
  %1405:g8rc = ADDItocL8 %1404:g8rc_and_g8rc_nox0, @157
  %1427:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @159
  %1428:g8rc = ADDItocL8 %1427:g8rc_and_g8rc_nox0, @159
  %1416:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @158
  %1417:g8rc = ADDItocL8 %1416:g8rc_and_g8rc_nox0, @158
  %1400:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @156
  %1401:g8rc = ADDItocL8 %1400:g8rc_and_g8rc_nox0, @156
  %1388:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @155
  %1389:g8rc = ADDItocL8 %1388:g8rc_and_g8rc_nox0, @155

These are the common medium code model pattern to get the address of the global variable @155 on PPC. ADDIStocHA8 and ADDItocL8 are both rematerializable instructions.

Sorry just seeing this. I'll have a quick look.

In the future, a couple of things:

qcolombet commented 1 month ago

@chenzheng1030 This may be a limitation due to the two-instruction sequence to access the address. Perhaps it is worthwhile experimenting with providing a single-instruction pseudo to accomplish this which would be rematerializable.

I haven't looked at the log yet, but if indeed the thing that needs to be rematerialized is more than one instruction at a time, RA as it stands, won't do it. In other words, at this point a pseudo is better.

chenzheng1030 commented 1 month ago

Thanks very much for having a check @qcolombet .

Here is a reduced case:

int a1;
int a2;
int a3;
int a4;
int a5;
int a6;
int a7;
int a8;
int a9;
int a10;
int a11;
int a12;
int a13;
int a14;
int a15;
int a16;
int a17;
int a18;
int a19;
int a20;
int a21;
int a22;
int a23;
int a24;
int a25;
int a26;
int a27;
int a28;
int a29;
int a30;
int a31;
int a32;

int a33;
int a34;
int a35;
int a36;

extern int bar1(int a);
extern int bar2(int* a);
/* Reproducer: each loop iteration sums 36 products of an opaque call result
   (bar2(&aN)) and an array element.  Every &aN address is loop-invariant, so
   MachineLICM hoists all 36 two-instruction TOC address sequences into the
   preheader; the register allocator then spills them instead of
   rematerializing, because only single instructions can be rematerialized. */
int foo(int *arr1, int *arr2, int *arr3, int *arr4, int *arr5, int *arr6, int *arr7, int *arr8, int *arr9, int count) {
  int sum = 0;  /* accumulator returned through bar1 */

  /* NOTE(review): arr*[i-1..i-3] read before the arrays' start when i < 3 —
     presumably intentional/harmless for a compile-only reproducer. */
  for (int i = 0; i < count; i++)
    sum += bar2(&a1) * arr1[i] + bar2(&a2) * arr1[i-1] + bar2(&a3) * arr1[i-2] + bar2(&a4) * arr1[i-3] +
           bar2(&a5) * arr2[i] + bar2(&a6) * arr2[i-1] + bar2(&a7) * arr2[i-2] + bar2(&a8) * arr2[i-3] +
           bar2(&a9) * arr3[i] + bar2(&a10) * arr3[i-1] + bar2(&a11) * arr3[i-2] + bar2(&a12) * arr3[i-3] +
           bar2(&a13) * arr4[i] + bar2(&a14) * arr4[i-1] + bar2(&a15) * arr4[i-2] + bar2(&a16) * arr4[i-3] +
           bar2(&a17) * arr5[i] + bar2(&a18) * arr5[i-1] + bar2(&a19) * arr5[i-2] + bar2(&a20) * arr5[i-3] +
           bar2(&a21) * arr6[i] + bar2(&a22) * arr6[i-1] + bar2(&a23) * arr6[i-2] + bar2(&a24) * arr6[i-3] +
           bar2(&a25) * arr7[i] + bar2(&a26) * arr7[i-1] + bar2(&a27) * arr7[i-2] + bar2(&a28) * arr7[i-3] +
           bar2(&a29) * arr8[i] + bar2(&a30) * arr8[i-1] + bar2(&a31) * arr8[i-2] + bar2(&a32) * arr8[i-3] +
           bar2(&a33) * arr9[i] + bar2(&a34) * arr9[i-1] + bar2(&a35) * arr9[i-2] + bar2(&a36) * arr9[i-3];

  return bar1(sum);
}

Before RA, in the loop preheader,

480B      %46:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a1
512B      %53:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a2
544B      %59:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a3
548B      %47:g8rc = ADDItocL8 %46:g8rc_and_g8rc_nox0, @a1, implicit $x2 // medium code model to get address of @a1
552B      %54:g8rc = ADDItocL8 %53:g8rc_and_g8rc_nox0, @a2, implicit $x2
560B      %60:g8rc = ADDItocL8 %59:g8rc_and_g8rc_nox0, @a3, implicit $x2

576B      %65:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a4
592B      %66:g8rc = ADDItocL8 %65:g8rc_and_g8rc_nox0, @a4, implicit $x2
608B      %71:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a5
624B      %72:g8rc = ADDItocL8 %71:g8rc_and_g8rc_nox0, @a5, implicit $x2
640B      %78:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a6
656B      %79:g8rc = ADDItocL8 %78:g8rc_and_g8rc_nox0, @a6, implicit $x2
672B      %84:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a7
688B      %85:g8rc = ADDItocL8 %84:g8rc_and_g8rc_nox0, @a7, implicit $x2
704B      %90:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a8
720B      %91:g8rc = ADDItocL8 %90:g8rc_and_g8rc_nox0, @a8, implicit $x2

After RA, in loop preheader:

560B      %326:g8rc = ADDItocL8 %46:g8rc_and_g8rc_nox0, @a1, implicit $x2
568B      STD %326:g8rc, 0, %stack.1 :: (store (s64) into %stack.1)
576B      %327:g8rc = ADDItocL8 %53:g8rc_and_g8rc_nox0, @a2, implicit $x2
584B      STD %327:g8rc, 0, %stack.2 :: (store (s64) into %stack.2)
592B      %328:g8rc = ADDItocL8 %59:g8rc_and_g8rc_nox0, @a3, implicit $x2
600B      STD %328:g8rc, 0, %stack.3 :: (store (s64) into %stack.3)
...
848B      %109:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a11
864B      %336:g8rc = ADDItocL8 %109:g8rc_and_g8rc_nox0, @a11, implicit $x2
872B      STD %336:g8rc, 0, %stack.11 :: (store (s64) into %stack.11)
880B      %115:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a12
896B      %337:g8rc = ADDItocL8 %115:g8rc_and_g8rc_nox0, @a12, implicit $x2
904B      STD %337:g8rc, 0, %stack.12 :: (store (s64) into %stack.12)
912B      %121:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a13
928B      %338:g8rc = ADDItocL8 %121:g8rc_and_g8rc_nox0, @a13, implicit $x2
936B      STD %338:g8rc, 0, %stack.13 :: (store (s64) into %stack.13)

For each variable with medium code model, PPC needs two instructions to get its address, instruction ADDItocL8 and ADDIStocHA8 which are both ReMaterializable.

In the greedy RA,

If RA cannot pull the two rematerializable instructions for a1, a2, a3 back down, it does not match the assumption made in MachineLICM. MachineLICM treats rematerializable instructions as always profitable to hoist; those hoisted instructions rely on RA being able to pull them back down when needed. For a11, a12, a13 it is even more confusing: RA spills directly for ADDIStocHA8 without pulling it down at all.

chenzheng1030 commented 1 month ago

For now, the small code model uses only one instruction, LDtoc. LDtoc is not currently marked rematerializable, so the register allocator does not pull LDtoc down, and there are many spills for the small code model as well. Marking LDtoc as rematerializable would let the register allocator pull it down as expected. Similarly, a rematerializable pseudo instruction for the medium code model should work.

qcolombet commented 1 month ago

Alright, I had a quick look and indeed the issue is that the code sequence to rematerialize is more than one-instruction long.

To take an example, consider:

848B      %109:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a11
864B      %336:g8rc = ADDItocL8 %109:g8rc_and_g8rc_nox0, @a11, implicit $x2

The live-range of %109 is super short, so it is not problematic with respect to RA (i.e., its interference with the rest of the live-ranges is minimal), but it will be a problem for rematerialization. Coming back to that in a second.

So now, the long lived live-range %336 is the one that gets spilled. What happens is %336 is marked as rematerializable so the inline spiller tries to do that before spilling. However, %336 needs %109 to be rematerialized and since this one has a super short live-range, it is not available at the point where we try to rematerialize %336. RA is not smart enough to rematerialize a second instruction (e.g., to bring %109 in) and as a result, it fails to rematerialize %336 and we spill it.

The workaround would be to use a pseudo to have only one instruction to rematerialize. Ideally fixing RA would be best, but this is not going to happen anytime soon.

The alternative would be to be smarter in MachineLICM, but that's also likely to turn into a heuristic hell!

chenzheng1030 commented 1 month ago

I see. Thanks for the explanation of why the pattern cannot be rematerialized. Yes, I tried to add a new heuristic in MachineLICM for another case (loading big immediates with more than one instruction) in a Phabricator patch, and the reviewer there also said that no more heuristics should be added to MachineLICM. The solution for that case was to add a pseudo instruction to load big immediates on PPC.

So for this issue, we also need a pseudo instruction to load the content of a TOC entry for medium code model. I'll do this.

Another necessary fix is to make LDtoc/LWZtoc rematerializable; otherwise there are too many spills for the above case with the small code model as well.