llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.81k stars 11.45k forks source link

High stack usage in Linux kernel's drivers/gpu/drm/omapdrm/dss/dispc.c #99265

Open nathanchance opened 1 month ago

nathanchance commented 1 month ago

Our continuous integration noticed two new instances of -Wframe-larger-than when building for ARCH=loongarch although it does not appear to be LoongArch specific after reducing the file down with cvise.

typedef short __u16;
typedef int __u32;
typedef __u16 u16;
typedef __u32 u32;
enum { true } typedef bool;
void _dev_printk(char);
struct device {
  struct device_node *of_node;
};
void *dev_get_drvdata(struct device *);
u32 __raw_readl(void *addr) { return *(u32 *)addr; }
void __raw_writel(u32 value, void *addr) { *(u32 *)addr = value; }
struct seq_file {
  void *private;
} seq_printf(struct seq_file *, ...);
bool of_property_read_bool(struct device_node *, char *);
struct platform_device {
  struct device dev;
} pm_runtime_disable(struct device *);
struct soc_device_attribute {};
enum omap_plane_id {
  OMAP_DSS_GFX,
  OMAP_DSS_VIDEO1,
  OMAP_DSS_VIDEO2,
  OMAP_DSS_VIDEO3,
  OMAP_DSS_WB
};
struct dss_device *dss_debugfs_create_file(struct dss_device *, char *, int(),
                                           void *);
u16 DISPC_OVL_BASE(enum omap_plane_id plane) {
  switch (plane) {
  case OMAP_DSS_GFX:
    return 128;
  case OMAP_DSS_VIDEO1:
    return 188;
  case OMAP_DSS_VIDEO2:
    return 76;
  case OMAP_DSS_VIDEO3:
    return 300;
  case OMAP_DSS_WB:
    return 0;
  }
  __builtin_unreachable();
}
u16 DISPC_ACCU2_0_OFFSET(enum omap_plane_id plane) {
  switch (plane) {
  case OMAP_DSS_VIDEO1:
    return 84;
  default:
    __builtin_unreachable();
  }
}
u16 DISPC_FIR_COEF_H2_OFFSET(enum omap_plane_id plane, u16 i) {
  switch (plane) {
  case OMAP_DSS_VIDEO1:
    return i * 8;
  default:
    __builtin_unreachable();
  }
}
u16 DISPC_FIR_COEF_HV2_OFFSET(enum omap_plane_id plane, u16 i) {
  switch (plane) {
  case OMAP_DSS_VIDEO1:
    return 1424 + i * 8;
  default:
    __builtin_unreachable();
  }
}
u16 DISPC_FIR_COEF_V2_OFFSET(enum omap_plane_id plane, u16 i) {
  switch (plane) {
  case OMAP_DSS_WB:
    return 224 + i * 4;
  default:
    __builtin_unreachable();
  }
}
struct dispc_device {
  struct dss_device *dss;
  long core_clk_rate;
  long tv_pclk_rate;
  u32 ctx[1024];
  u32 *gamma_table[4];
  struct dispc_features *feat;
};
void dispc_write_reg(struct dispc_device *dispc, u16 idx, u32 val) {
  __raw_writel(val, dispc + idx);
}
u32 dispc_read_reg(struct dispc_device *dispc, u16 idx) {
  return __raw_readl(dispc + idx);
}
static void dispc_save_context(struct dispc_device *dispc) {
  int i, j;
  for (i = 0;; i++) {
    for (j = 0; j; j++)
      ;
    {
      for (; j < 8; j++)
        dispc->ctx[DISPC_OVL_BASE(i) + j] = dispc_read_reg(
            dispc, DISPC_OVL_BASE(i) + DISPC_FIR_COEF_H2_OFFSET(i, j));
      for (j = 0; j < 8; j++)
        dispc->ctx[DISPC_OVL_BASE(i) + DISPC_FIR_COEF_HV2_OFFSET(i, j)] =
            dispc_read_reg(dispc, i + DISPC_FIR_COEF_HV2_OFFSET(i, j));
      for (j = 0; j < 8; j++)
        dispc->ctx[DISPC_OVL_BASE(i) + DISPC_FIR_COEF_V2_OFFSET(i, j)] =
            dispc_read_reg(dispc, i + DISPC_FIR_COEF_V2_OFFSET(i, j));
    }
  }
}
static void dispc_restore_context(struct dispc_device *dispc) {
  int i, j;
  for (i = 0;; i++) {
    for (j = 0; j; j++)
      ;
    {
      dispc_write_reg(dispc, 0,
                      dispc->ctx[DISPC_OVL_BASE(i) + DISPC_ACCU2_0_OFFSET(i)]);
      dispc_write_reg(dispc, i, dispc->ctx[DISPC_OVL_BASE(i)]);
      for (; j < 8; j++)
        dispc_write_reg(dispc, i + DISPC_FIR_COEF_H2_OFFSET(i, j),
                        dispc->ctx[DISPC_OVL_BASE(i) + j]);
      for (j = 0; j < 8; j++)
        dispc_write_reg(
            dispc, DISPC_FIR_COEF_HV2_OFFSET(i, j),
            dispc->ctx[DISPC_OVL_BASE(i) + DISPC_FIR_COEF_HV2_OFFSET(i, j)]);
      for (j = 0; j < 8; j++)
        dispc_write_reg(
            dispc, i + DISPC_FIR_COEF_V2_OFFSET(i, j),
            dispc->ctx[DISPC_OVL_BASE(i) + DISPC_FIR_COEF_V2_OFFSET(i, j)]);
    }
  }
}
static int dispc_dump_regs(struct seq_file *s) {
  struct dispc_device *dispc = s->private;
  int i, j;
  const char *mgr_names[] = {};
  const char *ovl_names[] = {};
  const char **p_names = mgr_names;
  p_names = ovl_names;
  for (i = j = 0;;)
    seq_printf(s, p_names[i], dispc_read_reg(dispc, 0));
  return 0;
}
int dispc_errata_i734_wa_init(struct dispc_device *);
struct soc_device_attribute dispc_soc_devices[] = {};
int dispc_bind(struct device dev) {
  struct platform_device *pdev = 0;
  struct soc_device_attribute *soc;
  struct dss_device *dss;
  struct dispc_device *dispc;
  u32 rev;
  int r;
  struct device_node *np = dev.of_node;
  dispc = 0;
  soc = dispc_soc_devices;
  if (soc)
    ;
  r = dispc_errata_i734_wa_init(dispc);
  if (of_property_read_bool(np, ""))
    goto err_free;
  goto err_runtime_get;
  _dev_printk(rev);
  dss_debugfs_create_file(dss, "", dispc_dump_regs, dispc);
err_runtime_get:
  pm_runtime_disable(&pdev->dev);
err_free:
  return r;
}
int dispc_runtime_suspend(struct device *dev) {
  struct dispc_device *dispc = dev_get_drvdata(dev);
  dispc_save_context(dispc);
  return 0;
}
int dispc_runtime_resume(struct device *dev) {
  struct dispc_device *dispc = dev_get_drvdata(dev);
  dispc_restore_context(dispc);
  return 0;
}

With this reproducer, the stack usage is much higher than compared to GCC (which appears to use no stack at all):

$ clang --target=x86_64-linux-gnu -O2 -Wall -Wframe-larger-than=64 -c -o /dev/null dispc.i
dispc.i:168:5: warning: stack frame size (2152) exceeds limit (64) in 'dispc_runtime_suspend' [-Wframe-larger-than]
  168 | int dispc_runtime_suspend(struct device *dev) {
      |     ^
dispc.i:173:5: warning: stack frame size (2136) exceeds limit (64) in 'dispc_runtime_resume' [-Wframe-larger-than]
  173 | int dispc_runtime_resume(struct device *dev) {
      |     ^
2 warnings generated.

$ x86_64-linux-gcc -O2 -Wall -Wframe-larger-than=1 -c -o /dev/null dispc.i
$ clang --target=aarch64-linux-gnu -O2 -Wall -Wframe-larger-than=64 -c -o /dev/null dispc.i
dispc.i:168:5: warning: stack frame size (2048) exceeds limit (64) in 'dispc_runtime_suspend' [-Wframe-larger-than]
  168 | int dispc_runtime_suspend(struct device *dev) {
      |     ^
dispc.i:173:5: warning: stack frame size (1760) exceeds limit (64) in 'dispc_runtime_resume' [-Wframe-larger-than]
  173 | int dispc_runtime_resume(struct device *dev) {
      |     ^
2 warnings generated.

$ aarch64-linux-gcc -O2 -Wall -Wframe-larger-than=1 -c -o /dev/null dispc.i
$ clang --target=loongarch64-linux-gnu -O2 -Wall -Wframe-larger-than=64 -c -o /dev/null dispc.i
dispc.i:168:5: warning: stack frame size (2144) exceeds limit (64) in 'dispc_runtime_suspend' [-Wframe-larger-than]
  168 | int dispc_runtime_suspend(struct device *dev) {
      |     ^
dispc.i:173:5: warning: stack frame size (2112) exceeds limit (64) in 'dispc_runtime_resume' [-Wframe-larger-than]
  173 | int dispc_runtime_resume(struct device *dev) {
      |     ^
2 warnings generated.

$ loongarch64-linux-gcc -O2 -Wall -Wframe-larger-than=1 -c -o /dev/null dispc.i

The Linux kernel uses -Wframe-larger-than=2048. This warning appears not to occur with -Os.

dtcxzyw commented 1 month ago
static void dispc_restore_context(struct dispc_device *dispc) {
  int i, j;
  for (i = 0;; i++) {
    for (j = 0; j; j++)
      ;
    {

Is it over-reduced? It looks like an infinite loop :(

nathanchance commented 1 month ago

It looks like it :/ The original code seems to have proper terminating conditions. Is there a way I can avoid this with cvise (or something I could look for in the output to add to an interestingness test)?

v01dXYZ commented 1 month ago

With godbolt, it seems the problem is present with clang-18.0.1 on if we add a i < N to the loop for N a constant lower or equal to 1 << 16. With 17.0.1, N is rather 3. That happens too with a non-constant i < n.

The problematic IR is produced by SimplifyCFGPass . The CFG looks like:

Then, the register allocator produce too many spills, although going to the copy BB should kill the corresponding subset of values. Maybe liveness analysis do not grasp it.