RX570/POLARIS12 panic during GPU post on 13/stable aarch64

agrajag9 commented 3 years ago

When attempting to kldmod amdgpu, the system panics

FreeBSD version FreeBSD honeycomb 13.0-STABLE FreeBSD 13.0-STABLE #2 stable/13-n245851-02966cbdf03: Wed Jun 2 23:16:06 UTC 2021 agrajag9@honeycomb:/usr/obj/usr/src/arm64.aarch64/sys/GENERIC arm64

PCI Info

pciconf -lv

nvme0@pci2:1:0:0: class=0x010802 rev=0x00 hdr=0x00 vendor=0x144d device=0xa808 subvendor=0x144d subdevice=0xa801 vendor = 'Samsung Electronics Co Ltd' device = 'NVMe SSD Controller SM981/PM981/PM983' class = mass storage subclass = NVM vgapci0@pci4:1:0:0: class=0x030000 rev=0xc7 hdr=0x00 vendor=0x1002 device=0x699f subvendor=0x1da2 subdevice=0xe367 vendor = 'Advanced Micro Devices, Inc. [AMD/ATI]' device = 'Lexa PRO [Radeon 540/540X/550/550X / RX 540X/550/550X]' class = display subclass = VGA none0@pci4:1:0:1: class=0x040300 rev=0x00 hdr=0x00 vendor=0x1002 device=0xaae0 subvendor=0x1da2 subdevice=0xaae0 vendor = 'Advanced Micro Devices, Inc. [AMD/ATI]' device = 'Baffin HDMI/DP Audio [Radeon RX 550 640SP / RX 560/560X]' class = multimedia subclass = HDA

DRM KMOD version

drm-fbsd13-kmod 5.4.92.g20210419
drm-kmod g20190710_1

To Reproduce Steps to reproduce the behavior: kldload -v amdgpu

Additional context

root@honeycomb:~ # kldload -v amdgpu
<6>[drm] amdgpu kernel modesetting enabled.
drmn0: <drmn> on vgapci0
vgapci0: child drmn0 requested pci_enable_io
vgapci0: child drmn0 requested pci_enable_io
sysctl_warn_reuse: can't re-use a leaf (hw.dri.debug)!
<6>[drm] initializing kernel modesetting (POLARIS12 0x1002:0x699F 0x1DA2:0xE367 0xC7).
<6>[drm] register mmio base: 0x40000000
<6>[drm] register mmio size: 262144
<6>[drm] add ip block number 0 <vi_common>
<6>[drm] add ip block number 1 <gmc_v8_0>
<6>[drm] add ip block number 2 <tonga_ih>
<6>[drm] add ip block number 3 <gfx_v8_0>
<6>[drm] add ip block number 4 <sdma_v3_0>
<6>[drm] add ip block number 5 <powerplay>
<6>[drm] add ip block number 6 <dm>
<6>[drm] add ip block number 7 <uvd_v6_0>
<6>[drm] add ip block number 8 <vce_v3_0>
<6>[drm] UVD is enabled in VM mode
<6>[drm] UVD ENC is enabled in VM mode
<6>[drm] VCE enabled in VM mode
<6>[drm] GPU posting now...
[drm ERROR :atom_op_jump] atombios stuck in loop for more than 10secs aborting
[drm ERROR :amdgpu_atom_execute_table_locked] atombios stuck executing AD44 (len 428, WS 20, PS 0) @ 0xAE76
[drm ERROR :amdgpu_atom_execute_table_locked] atombios stuck executing A984 (len 158, WS 0, PS 8) @ 0xA9E7
drmn0: gpu post error!
drmn0: Fatal error during GPU init
<6>[drm] amdgpu: finishing device.
Warning: can't remove non-dynamic nodes (dri)!
device_attach: drmn0 attach returned 22
  x0:                b
  x1:                0
  x2:     ffffffffee00
  x3:               33
  x4:         40100401
  x5:    800208000aaaa
  x6:                1
  x7:            f5ff5
  x8:              130
  x9:                0
 x10:                0
 x11:         80130000
 x12:              427
 x13:                0
 x14:         80000000
 x15:         402bd5e1
 x16:         403cd89c
 x17:     ffffffffe540
 x18:                0
 x19:     ffffffffeb30
 x20:                0
 x21:           200bd5
 x22:                1
 x23:     ffffffffee13
 x24:                0
 x25:                1
 x26:           200b2a
 x27:           200c82
 x28:                1
 x29:     ffffffffea80
  sp:     ffffffffe550
  lr:           2110fc
 elr:         403cd8a4
spsr:         80000200
 far:                0
 esr:         bf000000
panic: Unhandled System Error
cpuid = 7
time = 1613602221
KDB: stack backtrace:
#0 0xffff000000443e6c at kdb_backtrace+0x60
#1 0xffff0000003ee0cc at vpanic+0x184
#2 0xffff0000003edf44 at panic+0x44
#3 0xffff0000007048ac at do_serror+0x40
#4 0xffff0000006e5c9c at handle_serror+0x88
Uptime: 10m52s

valpackett commented 3 years ago

Just to be sure, try CURRENT with manually built 5.4-lts. But looks like it might be a PCIe issue. Is this the early revision LX2160 or the newer one?

Also, why is it POSTing here, doesn't the UEFI include the QEMU package to run the VBIOS? Would be interesting to see what happens on an already-POSTed GPU.

(On my mcbin, POSTing POLARIS10 from the driver works fine too, but still)

agrajag9 commented 3 years ago

Interesting though. I haven't updated the firmware in a while, will pull a new BSP image from SR later today for testing.

I'm not sure why it seems to be POSTing so late actually, especially since it clearly already did and I typically live in in efifb just fine. Is it worth reflashing the GPU firmware with an arm64 blob? Currently my GPUs only have the vendor-installed x64 GOP driver, but adding the Amd64 GOP driver seems easy enough and might help? (https://www.workofard.com/2020/12/aarch64-option-roms-for-amd-gpus/).

In the meantime, another similar coredump with a different AMD GPU, although still POLARIS12.

Jun  9 10:23:11 honeycomb devd[79118]: notify_clients: send() failed; dropping unresponsive client
Jun  9 10:23:11 honeycomb kernel: anon_inodefs registered
Jun  9 10:23:11 honeycomb kernel: debugfs registered
Jun  9 10:23:11 honeycomb kernel: [drm] amdgpu kernel modesetting enabled.
Jun  9 10:23:11 honeycomb kernel: drmn0: <drmn> on vgapci0
Jun  9 10:23:11 honeycomb kernel: vgapci0: child drmn0 requested pci_enable_io
Jun  9 10:23:11 honeycomb syslogd: last message repeated 1 times
Jun  9 10:23:11 honeycomb kernel: sysctl_warn_reuse: can't re-use a leaf (hw.dri.debug)!
Jun  9 10:23:11 honeycomb kernel: [drm] initializing kernel modesetting (POLARIS12 0x1002:0x6995 0x1028:0x0B0C 0x00).
Jun  9 10:23:11 honeycomb kernel: [drm] register mmio base: 0x40000000
Jun  9 10:23:11 honeycomb kernel: [drm] register mmio size: 262144
Jun  9 10:23:11 honeycomb kernel: [drm] add ip block number 0 <vi_common>
Jun  9 10:23:11 honeycomb kernel: [drm] add ip block number 1 <gmc_v8_0>
Jun  9 10:23:11 honeycomb kernel: [drm] add ip block number 2 <tonga_ih>
Jun  9 10:23:11 honeycomb kernel: [drm] add ip block number 3 <gfx_v8_0>
Jun  9 10:23:11 honeycomb kernel: [drm] add ip block number 4 <sdma_v3_0>
Jun  9 10:23:11 honeycomb kernel: [drm] add ip block number 5 <powerplay>
Jun  9 10:23:11 honeycomb kernel: [drm] add ip block number 6 <dm>
Jun  9 10:23:11 honeycomb kernel: [drm] add ip block number 7 <uvd_v6_0>
Jun  9 10:23:11 honeycomb kernel: [drm] add ip block number 8 <vce_v3_0>
Jun  9 10:23:21 honeycomb kernel: ATOM BIOS: 113-D0910602-101
Jun  9 10:23:21 honeycomb kernel: [drm] UVD is enabled in VM mode
Jun  9 10:23:21 honeycomb kernel: [drm] UVD ENC is enabled in VM mode
Jun  9 10:23:21 honeycomb kernel: [drm] VCE enabled in VM mode
Jun  9 10:23:21 honeycomb kernel: [drm] GPU posting now...
Jun  9 10:23:21 honeycomb kernel: [drm ERROR :atom_op_jump] atombios stuck in loop for more than 10secs aborting
Jun  9 10:23:21 honeycomb kernel: [drm ERROR :amdgpu_atom_execute_table_locked] atombios stuck executing B25A (len 428, WS 20, PS 0) @ 0xB38C
Jun  9 10:23:21 honeycomb kernel: [drm ERROR :amdgpu_atom_execute_table_locked] atombios stuck executing AE7A (len 158, WS 0, PS 8) @ 0xAEDD
Jun  9 10:23:21 honeycomb kernel: drmn0: gpu post error!
Jun  9 10:23:21 honeycomb kernel: drmn0: Fatal error during GPU init
Jun  9 10:23:21 honeycomb kernel: [drm] amdgpu: finishing device.
Jun  9 10:23:21 honeycomb kernel: Warning: can't remove non-dynamic nodes (dri)!
Jun  9 10:23:21 honeycomb kernel: device_attach: drmn0 attach returned 22
Jun  9 10:23:21 honeycomb kernel:   x0:               10

$ doas kgdb kernel.debug /var/crash/vmcore.last
Password:
GNU gdb (GDB) 10.2 [GDB v10.2 for FreeBSD]
Copyright (C) 2021 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "aarch64-portbld-freebsd13.0".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from kernel.debug...

Unread portion of the kernel message buffer:
  x1:                0
  x2:     ffffffffee40
  x3:               33
  x4:         40100401
  x5:    800208000aaaa
  x6:                1
  x7:            f4f6f
  x8:              130
  x9:                0
 x10:                0
 x11:         80130000
 x12:              427
 x13:                0
 x14:         80000000
 x15:         402be5e1
 x16:         403ce990
 x17:     ffffffffe5a0
 x18:                0
 x19:     ffffffffeb90
 x20:                0
 x21:           200bd5
 x22:                1
 x23:     ffffffffee53
 x24:                0
 x25:                1
 x26:           200b2a
 x27:           200c82
 x28:                1
 x29:     ffffffffeae0
  sp:     ffffffffe5b0
  lr:           2110fc
 elr:         403ce998
spsr:         80000200
 far:                0
 esr:         bf000000
panic: Unhandled System Error
cpuid = 5
time = 1623234201
KDB: stack backtrace:
#0 0xffff000000448a0c at kdb_backtrace+0x60
#1 0xffff0000003f2224 at vpanic+0x184
#2 0xffff0000003f209c at panic+0x44
#3 0xffff000000712c28 at do_serror+0x40
#4 0xffff0000006f3494 at handle_serror+0x88
Uptime: 11m23s
Dumping 1269 out of 32157 MB:..1%..11%..21%

get_curthread () at /usr/src/sys/arm64/include/pcpu.h:68
68              __asm __volatile("ldr   %0, [x18]" : "=&r"(td));
(kgdb) backtrace
#0  get_curthread () at /usr/src/sys/arm64/include/pcpu.h:68
#1  doadump (textdump=<optimized out>) at /usr/src/sys/kern/kern_shutdown.c:399
#2  0xffff0000003f1d20 in kern_reboot (howto=260) at /usr/src/sys/kern/kern_shutdown.c:486
#3  0xffff0000003f22b4 in vpanic (fmt=<optimized out>, ap=...) at /usr/src/sys/kern/kern_shutdown.c:919
#4  0xffff0000003f20a0 in panic (fmt=0x0) at /usr/src/sys/kern/kern_shutdown.c:843
#5  0xffff000000712c2c in do_serror (frame=<optimized out>) at /usr/src/sys/arm64/arm64/trap.c:599
#6  0xffff0000006f3498 in handle_serror () at /usr/src/sys/arm64/arm64/exception.S:216
...
#5246 0xffff0000006f3498 in handle_serror () at /usr/src/sys/arm64/arm64/exception.S:216

pciconf -lv

vgapci0@pci4:1:0:0: class=0x030000 rev=0x00 hdr=0x00 vendor=0x1002 device=0x6995 subvendor=0x1028 subdevice=0x0b0c vendor = 'Advanced Micro Devices, Inc. [AMD/ATI]' device = 'Lexa XT [Radeon PRO WX 2100]' class = display subclass = VGA none0@pci4:1:0:1: class=0x040300 rev=0x00 hdr=0x00 vendor=0x1002 device=0xaae0 subvendor=0x1028 subdevice=0xaae0 vendor = 'Advanced Micro Devices, Inc. [AMD/ATI]' device = 'Baffin HDMI/DP Audio [Radeon RX 550 640SP / RX 560/560X]' class = multimedia subclass = HDA

valpackett commented 3 years ago

Is it worth reflashing the GPU firmware with an arm64 blob?

I don't think it is.

it clearly already did and I typically live in in efifb just fine

huh. Well then probably the same thing that causes the POST to fail also causes the driver to wrongly detect the GPU as uninitialized.

Again, is this the early revision LX2160 or the newer one? And please try CURRENT with manually built 5.4-lts from this repo.

agrajag9 commented 3 years ago

Yep, still the early hardware revision, but I haven't had any PCI problems since the last we talked about it.

Will test more with CURRENT GENERIC + 5.4-lts once it's all built.

agrajag9 commented 3 years ago

No dice:

FreeBSD honeycomb.a9development.com 14.0-CURRENT FreeBSD 14.0-CURRENT #0 main-n247275-aa310ebfba3: Thu Jun 10 14:01:13 UTC 2021     agrajag9@honeycomb.a9development.com:/usr/obj/usr/src/arm64.aarch64/sys/GENERIC  arm64

with drm_v5.4.92_4

# kldload -v amdgpu
anon_inodefs registered
debugfs registered
<6>[drm] amdgpu kernel modesetting enabled.
drmn0: <drmn> on vgapci0
vgapci0: child drmn0 requested pci_enable_io
vgapci0: child drmn0 requested pci_enable_io
sysctl_warn_reuse: can't re-use a leaf (hw.dri.debug)!
<6>[drm] initializing kernel modesetting (POLARIS12 0x1002:0x699F 0x1DA2:0xE367 0xC7).
<6>[drm] register mmio base: 0x40000000
<6>[drm] register mmio size: 262144
<6>[drm] add ip block number 0 <vi_common>
<6>[drm] add ip block number 1 <gmc_v8_0>
<6>[drm] add ip block number 2 <tonga_ih>
<6>[drm] add ip block number 3 <gfx_v8_0>
<6>[drm] add ip block number 4 <sdma_v3_0>
<6>[drm] add ip block number 5 <powerplay>
<6>[drm] add ip block number 6 <dm>
<6>[drm] add ip block number 7 <uvd_v6_0>
<6>[drm] add ip block number 8 <vce_v3_0>
<6>[drm] UVD is enabled in VM mode
<6>[drm] UVD ENC is enabled in VM mode
<6>[drm] VCE enabled in VM mode
<6>[drm] GPU posting now...
[drm ERROR :atom_op_jump] atombios stuck in loop for more than 10secs aborting
[drm ERROR :amdgpu_atom_execute_table_locked] atombios stuck executing AD44 (len 428, WS 20, PS 0) @ 0xAE76
[drm ERROR :amdgpu_atom_execute_table_locked] atombios stuck executing A984 (len 158, WS 0, PS 8) @ 0xA9E7
drmn0: gpu post error!
drmn0: Fatal error during GPU init
<6>[drm] amdgpu: finishing device.
Warning: can't remove non-dynamic nodes (dri)!
device_attach: drmn0 attach returned 22
  x0:                b
  x1:                0
  x2:     ffffffffee60
  x3:               13
  x4:         40100401
  x5:            80020
  x6:                1
  x7:            f4e3b
  x8:              130
  x9:                0
 x10:                0
 x11:         80130000
 x12:              427
 x13:         403c2000
 x14:         40805ee0
 x15:             2000
 x16:         402d3208
 x17:     ffffffffe5c0
 x18:               3f
 x19:     ffffffffeba8
 x20:                0
 x21:           100cd5
 x22:                1
 x23:     ffffffffee63
 x24:                0
 x25:                1
 x26:           100c2a
 x27:           100d82
 x28:                1
 x29:     ffffffffeb00
  sp:     ffffffffe5d0
  lr:           111234
 elr:         402d3210
spsr:         80000200
 far:                0
 esr:         bf000000
panic: Unhandled System Error
cpuid = 7
time = 1623343552
KDB: stack backtrace:
db_trace_self() at db_trace_self
db_trace_self_wrapper() at db_trace_self_wrapper+0x30
vpanic() at vpanic+0x184
panic() at panic+0x44
do_serror() at do_serror+0x40
handle_serror() at handle_serror+0x88
--- system error, esr 0xbf000000
KDB: enter: panic
[ thread pid 30094 tid 194022 ]
Stopped at      kdb_enter+0x44: undefined       f904411f

valpackett commented 3 years ago

There's this amdgpu.pcie_gen_cap=0x00040004 thing on e.g. this post is about. It forces PCIe gen3, by default gen 1/2 are also allowed. I thought it would be just some performance thing but I just found this tweet (also here):

I tracked down the amdgpu hang to the PCIe bus link scaling. We can force the link to PCIe Gen3 permanently and all other memory and clock scaling works flawlessly

Soooooo try hw.amdgpu.pcie_gen_cap=0x00040004 in kenv (e.g. /boot/loader.conf)?

agrajag9 commented 3 years ago

Of course the error is in a screenshot where it's not text-searchable...

Anyways, added the following to /boot/loader.conf.local but still panicking in the same way:

hw.amdgpu.pcie_gen_cap=0x00040004
hw.syscons.disable=1

Do we still need the syscons disable? I see issue 60 where you finally puzzled that out.

valpackett commented 3 years ago

Do we still need the syscons disable? I see issue 60 where you finally puzzled that out.

The fix is in #61 which is still unmerged :/ but you can apply that yourself (rebase the branch onto current 5.4-lts or cherry-pick the commit).

Also you didn't definitely need syscons disable, only if your efifb resolution was high enough that the memory overlapped. (I needed it for >=1440p)

To 100% make sure there's no weirdness with the tunable stuff, try doing this in code instead, changing https://github.com/freebsd/drm-kmod/blob/b45715c9a147c06fdc1c298cbcb52e1ebf9f81f3/drivers/gpu/drm/amd/include/amd_pcie.h#L43-L47 to #define AMDGPU_DEFAULT_PCIE_GEN_MASK (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3)

agrajag9 commented 3 years ago

.if ${.CURDIR:M*/graphics/drm-current-kmod}
EXTRA_PATCHES+= /distfiles/local-patches/graphics/drm-current-kmod.patch
.endif

--- drivers/gpu/drm/amd/include/amd_pcie.h.orig 2021-06-12 11:02:26.030476000 +0000
+++ drivers/gpu/drm/amd/include/amd_pcie.h      2021-06-12 11:03:24.635405000 +0000
@@ -40,10 +40,7 @@
 #define CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_SHIFT  0

 /* gen: chipset 1/2, asic 1/2/3 */
-#define AMDGPU_DEFAULT_PCIE_GEN_MASK (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 \
-                                     | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 \
-                                     | CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 \
-                                     | CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 \
+#define AMDGPU_DEFAULT_PCIE_GEN_MASK (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 \
                                      | CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3)

 /* Following flags shows PCIe lane width switch supported in driver which are decided by chipset and ASIC */

=======================<phase: patch          >============================
===>  Patching for drm-current-kmod-5.4.92.g20210526
===>  Applying extra patch /distfiles/local-patches/graphics/drm-current-kmod.patch
===========================================================================

Same panic :(

It looks I'm still using a UEFI from a while ago and there may be some updates there. I'll drop a new one in and see if that helps as well...

agrajag9 commented 3 years ago

Tried with a fresh firmware build, same panic.

agrajag9 commented 3 years ago

Found this: https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/issues/62#issuecomment-433967041

The error "GPU posting now" appears when a secondary card is initialized that didn't get posted by the BIOS. You can enable more debug messages in the GPU driver with the kernel parameter drm.debug=0xff.

This is curious, because there shouldn't be another GPU attached?

And this: http://macchiatobin.net/forums/topic/gpu/#post-7368

Added hw.drm.debug=0xff to /boot/loader.conf.local but didn't see anything new in the trace.

agrajag9 commented 3 years ago

I also found some more in threads about IOMMU and iirc we do not support IOMMU (SMMU on Arm?) and I think that's something Jon was building into the firmware. Possibly that's an issue?

valpackett commented 3 years ago

Debug flag for us is hw.dri.drm_debug.

The error "GPU posting now" appears when a secondary card is initialized that didn't get posted by the BIOS

That's just one possible cause, absolutely not the only one. (Also "GPU posting now" on its own is not an error, if you don't have efifb it's expected.)

And this: http://macchiatobin.net/forums/topic/gpu/#post-7368

Oh, as you can see this post is already talking about OpenGL stuff, not early init. I've seen the corruption myself :) This was solved upstream some time ago, my backport was https://github.com/FreeBSDDesktop/kms-drm/pull/154/commits/7fe2f58bb359668a581547d68eac09516cf09768

we do not support IOMMU (SMMU on Arm?)

We support SMMU 3 since https://reviews.freebsd.org/D24618 but not SMMU 2. In any case it shouldn't be mandatory to use the IOMMU. Especially since you do have other PCIe cards working…

hmm hmm I wonder if the PCIe link gen is not being applied through LinuxKPI somehow

agrajag9 commented 3 years ago

Wrong button...

Latest acpidump -dt for you in case there's something wacky in there: https://gist.github.com/4cfb12e38d4f5845069d8f3f92c96fb6

valpackett commented 3 years ago

pciconf -lvbc might be more useful

valpackett commented 3 years ago

Oddly the AMDGPU_DEFAULT_PCIE_GEN_MASK is only applied for APUs (?? check is actually just pci_is_root_bus which would be true on the mcbin IIUC :D)

Try this

--- i/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ w/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4013,9 +4013,11 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
            adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
        if (adev->pm.pcie_mlw_mask == 0)
            adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
-       return;
+       // return;
    }

+   adev->pm.pcie_gen_mask = CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3;
+
    if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
        return;

agrajag9 commented 3 years ago

# pciconf -lvbc
nvme0@pci2:1:0:0:       class=0x010802 rev=0x00 hdr=0x00 vendor=0x144d device=0xa808 subvendor=0x144d subdevice=0xa801
    vendor     = 'Samsung Electronics Co Ltd'
    device     = 'NVMe SSD Controller SM981/PM981/PM983'
    class      = mass storage
    subclass   = NVM
    bar   [10] = type Memory, range 64, base 0x40000000, size 16384, enabled
    cap 01[40] = powerspec 3  supports D0 D3  current D0
    cap 05[50] = MSI supports 1 message, 64 bit
    cap 10[70] = PCI-Express 2 endpoint max data 128(256) FLR RO NS
                 max read 512
                 link x4(x4) speed 8.0(8.0) ASPM disabled(L1) ClockPM disabled
    cap 11[b0] = MSI-X supports 33 messages, enabled
                 Table in map 0x10[0x3000], PBA in map 0x10[0x2000]
    ecap 0001[100] = AER 2 0 fatal 0 non-fatal 0 corrected
    ecap 0003[148] = Serial 1 0000000000000000
    ecap 0004[158] = Power Budgeting 1
    ecap 0019[168] = PCIe Sec 1 lane errors 0
    ecap 0018[188] = LTR 1
    ecap 001e[190] = L1 PM Substates 1
vgapci0@pci4:1:0:0:     class=0x030000 rev=0xc7 hdr=0x00 vendor=0x1002 device=0x699f subvendor=0x1da2 subdevice=0xe367
    vendor     = 'Advanced Micro Devices, Inc. [AMD/ATI]'
    device     = 'Lexa PRO [Radeon 540/540X/550/550X / RX 540X/550/550X]'
    class      = display
    subclass   = VGA
    bar   [10] = type Prefetchable Memory, range 64, base 0xa400000000, size 268435456, enabled
    bar   [18] = type Prefetchable Memory, range 64, base 0xa410000000, size 2097152, enabled
    bar   [20] = type I/O Port, range 32, base 0, size 256, disabled
    bar   [24] = type Memory, range 32, base 0x40000000, size 262144, enabled
    cap 09[48] = vendor (length 8)
    cap 01[50] = powerspec 3  supports D0 D1 D2 D3  current D0
    cap 10[58] = PCI-Express 2 legacy endpoint max data 128(256) RO NS
                 max read 512
                 link x8(x8) speed 8.0(8.0) ASPM disabled(L1) ClockPM disabled
    cap 05[a0] = MSI supports 1 message, 64 bit
    ecap 000b[100] = Vendor [1] ID 0001 Rev 1 Length 16
    ecap 0001[150] = AER 2 0 fatal 0 non-fatal 1 corrected
    ecap 0015[200] = Resizable BAR 1
    ecap 0019[270] = PCIe Sec 1 lane errors 0
    ecap 000f[2b0] = ATS 1
    ecap 0013[2c0] = Page Page Request 1
    ecap 001b[2d0] = Process Address Space ID 1
    ecap 0018[320] = LTR 1
    ecap 000e[328] = ARI 1
    ecap 001e[370] = L1 PM Substates 1
none0@pci4:1:0:1:       class=0x040300 rev=0x00 hdr=0x00 vendor=0x1002 device=0xaae0 subvendor=0x1da2 subdevice=0xaae0
    vendor     = 'Advanced Micro Devices, Inc. [AMD/ATI]'
    device     = 'Baffin HDMI/DP Audio [Radeon RX 550 640SP / RX 560/560X]'
    class      = multimedia
    subclass   = HDA
    bar   [10] = type Memory, range 64, base 0x40040000, size 16384, enabled
    cap 09[48] = vendor (length 8)
    cap 01[50] = powerspec 3  supports D0 D1 D2 D3  current D0
    cap 10[58] = PCI-Express 2 legacy endpoint max data 128(256) RO NS
                 max read 512
                 link x8(x8) speed 8.0(8.0) ASPM disabled(L1) ClockPM disabled
    cap 05[a0] = MSI supports 1 message, 64 bit
    ecap 000b[100] = Vendor [1] ID 0001 Rev 1 Length 16
    ecap 0001[150] = AER 2 0 fatal 0 non-fatal 1 corrected
    ecap 000e[328] = ARI 1

=======================<phase: patch          >============================
===>  Patching for drm-current-kmod-5.4.92.g20210526
===>  Applying extra patch /distfiles/local-patches/graphics/drm-current-kmod/amdgpu_device_c.patch
===>  Applying extra patch /distfiles/local-patches/graphics/drm-current-kmod/amd_pcie_h.patch
===========================================================================

Same panic...

agrajag9 commented 3 years ago

I just noticed this:

cap 10[58] = PCI-Express 2 legacy endpoint

valpackett commented 3 years ago

Hm, very similar on my mcbin actually, even 1 more corrected error, but everything works

pciconf here

``` vgapci0@pci0:0:0:0: class=0x030000 rev=0xc7 hdr=0x00 vendor=0x1002 device=0x67df subvendor=0x1002 subdevice=0x0b37 vendor = 'Advanced Micro Devices, Inc. [AMD/ATI]' device = 'Ellesmere [Radeon RX 470/480/570/570X/580/580X/590]' class = display subclass = VGA bar [10] = type Prefetchable Memory, range 64, base 0x800000000, size 268435456, enabled bar [18] = type Prefetchable Memory, range 64, base 0x810000000, size 2097152, enabled bar [20] = type I/O Port, range 32, base 0, size 256, disabled bar [24] = type Memory, range 32, base 0xc0000000, size 262144, enabled cap 09[48] = vendor (length 8) cap 01[50] = powerspec 3 supports D0 D1 D2 D3 current D0 cap 10[58] = PCI-Express 2 legacy endpoint max data 128(256) RO NS max read 512 link x4(x16) speed 8.0(8.0) ASPM disabled(L1) cap 05[a0] = MSI supports 1 message, 64 bit ecap 000b[100] = Vendor [1] ID 0001 Rev 1 Length 16 ecap 0001[150] = AER 2 0 fatal 0 non-fatal 2 corrected ecap 0015[200] = Resizable BAR 1 ecap 0019[270] = PCIe Sec 1 lane errors 0 ecap 000f[2b0] = ATS 1 ecap 0013[2c0] = Page Page Request 1 ecap 001b[2d0] = Process Address Space ID 1 ecap 0018[320] = LTR 1 ecap 000e[328] = ARI 1 ecap 001e[370] = L1 PM Substates 1 ``` Changes after loading amdgpu: ```diff @@ -5,14 +5,14 @@ subclass = VGA bar [10] = type Prefetchable Memory, range 64, base 0x800000000, size 268435456, enabled bar [18] = type Prefetchable Memory, range 64, base 0x810000000, size 2097152, enabled - bar [20] = type I/O Port, range 32, base 0, size 256, disabled + bar [20] = type I/O Port, range 32, base 0, size 256, enabled bar [24] = type Memory, range 32, base 0xc0000000, size 262144, enabled cap 09[48] = vendor (length 8) cap 01[50] = powerspec 3 supports D0 D1 D2 D3 current D0 cap 10[58] = PCI-Express 2 legacy endpoint max data 128(256) RO NS max read 512 - link x4(x16) speed 8.0(8.0) ASPM disabled(L1) - cap 05[a0] = MSI supports 1 message, 64 bit + link x4(x16) speed 5.0(8.0) ASPM disabled(L1) + cap 05[a0] = MSI supports 1 message, 64 bit enabled with 1 message ecap 000b[100] = Vendor [1] ID 0001 Rev 1 Length 16 ecap 0001[150] = AER 2 0 fatal 0 non-fatal 2 corrected ecap 0015[200] = Resizable BAR 1 @@ -33,7 +33,7 @@ cap 01[50] = powerspec 3 supports D0 D1 D2 D3 current D3 cap 10[58] = PCI-Express 2 legacy endpoint max data 128(256) RO NS max read 512 - link x4(x16) speed 8.0(8.0) ASPM disabled(L1) + link x4(x16) speed 5.0(8.0) ASPM disabled(L1) cap 05[a0] = MSI supports 1 message, 64 bit ecap 000b[100] = Vendor [1] ID 0001 Rev 1 Length 16 ecap 0001[150] = AER 2 0 fatal 0 non-fatal 2 corrected ```

cap 10[58] = PCI-Express 2 legacy endpoint

That's just what the device is, that's static data IIUC. speed 8.0(8.0) is the indication that it's running at Gen 3 speed. (And yeah it will always be at that speed before the driver loads, so this was not very useful unfortunately…)

Seems like link speed renegotiation is initiated from the GPU side firmware, and that parameter in the driver should tell it what speeds to support, so there kinda shouldn't be differences between Linux and FreeBSD in terms of that stuff.

To be sure: have you tested under Linux?

agrajag9 commented 3 years ago

I have not had a chance to test under Linux, will try to get to that this week.

Meanwhile, I just saw this on the OpenBSD 6.9 release notes:

Fixed panics on the HoneyComb LX2K with amdgpu(4).

That got me to this commit: https://github.com/openbsd/src/commit/9e1dc75e2daa85a654de4a3aeb461dca3f921917

From bluerise on Discord:

That only helped a little. The one thing that helped a lot, but still doesn‘t fix all, was switching from WT (for write-combine mappings) to DEVICE.

valpackett commented 3 years ago

That got me to this commit: openbsd/src@9e1dc75

Huh.

Linux' iowrite32/ioread32 explicitly contain barriers

Well, our implementations do too.

Even though there is an odd "XXX This is all x86 specific" comment, it works fine on the MACCHIATObin so we can be sure there's nothing affecting arm64-in-general. Looking at the impl, ioread32 does readl which is

    __io_br(); // __compiler_membar() // __asm __volatile(" " : : : "memory")
    v = le32toh(__raw_readl(addr));
    __io_ar(); // rmb() ifdef rmb // defined as dmb(ld) in arm64/include/atomic.h

and that's how it is on Linux too.

switching from WT (for write-combine mappings) to DEVICE

https://reviews.freebsd.org/rS351693 ;)

agrajag9 commented 3 years ago

Sorry, forgot to answer - Yes, still the early board rev, but I haven't had problems with PCI in months.

agrajag9 commented 3 years ago

Confirmed the WX 2100 works just fine with drm-fbsd13-kmod-5.4.92.g20210419 in stable/13-n245876-088dbb4b8d3 on amd64. Waiting on some new USB drives in the mail so I can make a bootable Linux because somehow I ran out of spare thumb drives...

agrajag9 commented 3 years ago

And just to be sure I'm not doing something super dumb elsewhere, here's /boot/loader.conf.local:

boot_multicons="YES"
boot_serial="YES"
console="efi"
exec="gop set 0"
verbose_loading="YES"
boot_verbose="-v"

No kld lines in rc.conf

agrajag9 commented 3 years ago

Having an absolutely attrocious time getting Linux to run on this thing, but Fed34 gave me this:

[   51.047877] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx timeout, signaled seq=82, emitted seq=85
[   51.065702] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process gnome-shell pid 1188 thread gnome-shel:cs0 pid 1236
[   51.078119] amdgpu 0004:01:00.0: amdgpu: GPU reset begin!
[   51.555537] amdgpu 0004:01:00.0: [drm:amdgpu_ring_test_helper [amdgpu]] *ERROR* ring kiq_2.1.0 test failed (-110)
[   51.566035] [drm:gfx_v8_0_hw_fini [amdgpu]] *ERROR* KCQ disable failed
[   51.819517] amdgpu: cp is busy, skip halt cp
[   52.070271] amdgpu: rlc is busy, skip halt rlc
[   52.075738] amdgpu 0004:01:00.0: amdgpu: BACO reset
[   52.611663] amdgpu 0004:01:00.0: amdgpu: GPU reset succeeded, trying to resume
[   52.620026] [drm] PCIE GART of 256M enabled (table at 0x000000F400E10000).
[   52.626909] [drm] VRAM is lost due to GPU reset!
[   52.871100] [drm] UVD and UVD ENC initialized successfully.
[   52.977077] [drm] VCE initialized successfully.
[   52.986042] amdgpu 0004:01:00.0: amdgpu: recover vram bo from shadow start
[   52.994516] amdgpu 0004:01:00.0: amdgpu: recover vram bo from shadow done
[   53.001310] [drm] Skip scheduling IBs!
[   53.005049] [drm] Skip scheduling IBs!
[   53.008901] [drm] Skip scheduling IBs!
[   53.008902] amdgpu 0004:01:00.0: amdgpu: GPU reset(2) succeeded!
[   53.018652] [drm] Skip scheduling IBs!
[   53.117794] fbcon: Taking over console
[   53.138909] Console: switching to colour frame buffer device 320x90
[  110.957884] [drm:amdgpu_dm_commit_planes [amdgpu]] *ERROR* Waiting for fences timed out!
[  121.447875] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx timeout, signaled seq=300, emitted seq=303
[  121.465868] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process gnome-shell pid 2050 thread gnome-shel:cs0 pid 2080
[  121.478280] amdgpu 0004:01:00.0: amdgpu: GPU reset begin!
[  121.955323] amdgpu 0004:01:00.0: [drm:amdgpu_ring_test_helper [amdgpu]] *ERROR* ring kiq_2.1.0 test failed (-110)
[  121.965824] [drm:gfx_v8_0_hw_fini [amdgpu]] *ERROR* KCQ disable failed
[  122.218874] amdgpu: cp is busy, skip halt cp
[  122.469444] amdgpu: rlc is busy, skip halt rlc
[  122.474906] amdgpu 0004:01:00.0: amdgpu: BACO reset
[  123.021687] amdgpu 0004:01:00.0: amdgpu: GPU reset succeeded, trying to resume
[  123.030047] [drm] PCIE GART of 256M enabled (table at 0x000000F400E10000).
[  123.036930] [drm] VRAM is lost due to GPU reset!
[  123.281082] [drm] UVD and UVD ENC initialized successfully.
[  123.387057] [drm] VCE initialized successfully.
[  123.396054] amdgpu 0004:01:00.0: amdgpu: recover vram bo from shadow start
[  123.404751] amdgpu 0004:01:00.0: amdgpu: recover vram bo from shadow done
[  123.411547] [drm] Skip scheduling IBs!
[  123.415285] [drm] Skip scheduling IBs!
[  123.419108] amdgpu 0004:01:00.0: amdgpu: GPU reset(4) succeeded!
[  123.419108] [drm] Skip scheduling IBs!

There's a lot more but the system hangs for arbitrary amounts of time and get significant graphical distotions.

Importantly I had to set arm-smmu.disable_bypass=0 on grub's linux line or it would hang somewhere during init while spamming:

[   27.276297] arm-smmu arm-smmu.0.auto: Blocked unknown Stream ID 0x4000; boot with "arm-smmu.disable_bypass=0" to allow, but this may have security implications
[   27.290536] arm-smmu arm-smmu.0.auto:        GFSR 0x80000002, GFSYNR0 0x00000008, GFSYNR1 0x00004000, GFSYNR2 0x00000000

If I can get it to behave, I'll be able to post cleaner, fuller output...

agrajag9 commented 3 years ago

Some more context: https://gist.github.com/agrajag9/b0c3722f472d4e8ef6f27c194fc2cf19

agrajag9 commented 3 years ago

Confirmed that glmark2 could run on Fedora with amdgpu drm. So it's definitely not the hardware...

agrajag9 commented 3 years ago

@unrelentingtech any cheap Nvidia GPUs you know of that I can test with just to be sure it's not the HC? Maybe this guy?

https://pcpartpicker.com/product/rtnG3C/msi-video-card-gt7101gd3hlp

Looks like it should work:

https://bsd-hardware.info/?id=pci:10de-128b-1462-8c93

valpackett commented 3 years ago

How are you gonna test an nvidia gpu on FreeBSD/aarch64? I'm pretty sure they only support FreeBSD/amd64. And we don't have nouveau here (I've… tried. It's hard.)

Actually, hmm, maybe test OpenBSD and NetBSD?

If OpenBSD doesn't work but NetBSD does, I'd be fairly confident that it's https://github.com/NetBSD/src/blob/trunk/sys/arch/arm/acpi/acpi_pci_layerscape_gen4.c doing the magic. I think I've tried porting that a while ago, look around in your email :D maybe retry my patch for that as well then.

agrajag9 commented 3 years ago

Putting this here for future reference - your port of NetBSD's acpi_pci_layerscape_gen4.c: https://github.com/DankBSD/base/commit/6cbdafbc310b9a0fa4d046f71979aa01302e8b0b

agrajag9 commented 3 years ago

Well, that didn't work...

pcib0: <NXP Layerscape PCI host controller> on acpi0
pcib0: Bus is cache-coherent
pcib0: ECAM for bus 1-255 at mem 9000100000-900fffffff
pcib0: found the nxp0016!pcib0: found the memory resource!pcib0: mapped the memory resource!pcib0: Controller revision 0x10
pci0: <PCI bus> on pcib0
pci0: domain=2, physical bus=1
  x0:                0
  x1: ffff00011ef00000
  x2:                2
  x3:               33
  x4:                2
  x5:                2
  x6:                f
  x7:              5a0
  x8: ffff000000952800
  x9: ffff0000006eec1c
 x10:               10
 x11: ffff0000006eed1c
 x12:                0
 x13:                4
 x14:               80
 x15:                1
 x16:              3a0
 x17:                3
 x18: ffff000000d976b0
 x19: ffffa00001a2a000
 x20:                2
 x21:                2
 x22:              c02
 x23:                1
 x24:         ffffffff
 x25: ffff000000967000
 x26: ffff000000967000
 x27: ffffa00001a34c70
 x28:         e5e00000
 x29: ffff000000d976b0
  sp: ffff000000d976b0
  lr: ffff0000007233b0
 elr: ffff0000006eec1c
spsr:         800001c5
 far: ffff00011ef00002
 esr:         96000021
panic: Misaligned access from kernel space!
cpuid = 0
time = 1
KDB: stack backtrace:
#0 0xffff0000004476f0 at kdb_backtrace+0x60
#1 0xffff0000003f30ac at vpanic+0x184
#2 0xffff0000003f2f24 at panic+0x44
#3 0xffff000000713674 at align_abort+0xb8
#4 0xffff0000006f3874 at handle_el1h_sync+0x74
#5 0xffff0000007233ac at layerscape_pcie_read_config+0x234
#6 0xffff000000195bf0 at pci_read_device+0xd8
#7 0xffff00000019dbc0 at pci_add_children+0x44
#8 0xffff0000001a0bec at pci_attach+0xd8
#9 0xffff00000043383c at device_attach+0x400
#10 0xffff000000434d54 at bus_generic_attach+0x4c
#11 0xffff000000723164 at layerscape_pcie_acpi_attach+0xf0
#12 0xffff00000043383c at device_attach+0x400
#13 0xffff000000435860 at bus_generic_new_pass+0x11c
#14 0xffff0000004357f0 at bus_generic_new_pass+0xac
#15 0xffff0000004357f0 at bus_generic_new_pass+0xac
#16 0xffff000000437940 at root_bus_configure+0x40
#17 0xffff00000036bec4 at mi_startup+0x11c
Uptime: 1s

I'm not finding much related to loading amdgpu on OpenBSD or NetBSD - do they even have ports for this?

I also noticed that amdgpu_device_need_post in drivers/gpu/drm/amd/amdgpu/amdgpu_device.c defaults to true - perhaps worth adding some extra DRM_INFO() lines inside there to see why it thinks it needs to post at all? I noticed when comparing the module init logs between FreeBSD and Fed34 that on the latter it detects that the GPU already posted.

valpackett commented 3 years ago

OpenBSD doesn't have loadable modules. Seems like it's just included in GENERIC kernels: https://github.com/openbsd/src/blob/233569a8f40904a0c6941ee921fa2e38a77a6c7e/sys/arch/arm64/conf/GENERIC#L130

NetBSD has.. something.. https://github.com/NetBSD/src/tree/trunk/sys/external/bsd/drm2 but I'm actually not sure if it's in working state at all.

perhaps worth adding some extra DRM_INFO() lines inside there to see why it thinks it needs to post at all

Didn't we try this (over email)? Actually probably not, I don't remember.

The failure is probably from amdgpu_atombios_scratch_need_asic_init (amdgpu_atombios.c), you can also add logging for the actual register value it reads.

agrajag9 commented 3 years ago

Yes, some quick DRM_INFO additions confirmed that's returning true.

<6>[drm] adev->asic_type >= CHIP_BONAIRE
<6>[drm] calling amdgpu_atombios_scratch_need_asic_init(adev)
<6>[drm] In drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c:
<6>[drm]        adev->bios_scratch_reg_offset      == 5c9
<6>[drm] RREG32(adev->bios_scratch_reg_offset + 7) == 0

I'm not sure what we'd expect to get here, but probably not 0?

agrajag9 commented 3 years ago

Is there a chance this is related to the issues you found with the RX 480 and EDK2 ECAM on the MACCHIATTObin? Since I know we've had to do weird ECAM stuff with this system before.

valpackett commented 3 years ago

That read on my RX 480 + mcbin returns 0x00ec030a.

Is there a chance this is related to the issues you found with the RX 480 and EDK2 ECAM on the MACCHIATTObin?

No. The thing on the mcbin (and socionext developerbox) is that the DesignWare controller doesn't filter TLPs properly, so some devices — mostly "legacy" ones — would appear duplicated, possibly into all the slots. AMD GPUs actually do their own filtering (just like devices supporting ARI, except it doesn't support ARI), so all we had to do was remove the workaround that basically only allowed legacy devices to work.

The gen4 controller used in the early rev LX2160 is a completely different controller.

https://github.com/freebsd/drm-kmod/blob/e5194b8949b316adfcda4895fc117cf1045d9336/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L179-L182

this is how these reads work. You could add logging right there to see what other similar reads return. Maybe also add a msleep(2) between the writel and readl.

agrajag9 commented 3 years ago

dmesg spammed with <6>[drm] In amdgpu_mm_rreg: ret == 0 :(

Current patch is looking like this:

--- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c.orig     2021-06-29 00:04:05.165929000 +0000
+++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  2021-06-29 13:16:16.977577000 +0000
@@ -178,10 +178,12 @@

                spin_lock_irqsave(&adev->mmio_idx_lock, flags);
                writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
+               msleep(2);
                ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
                spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        }
        trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
+       DRM_INFO("In amdgpu_mm_rreg: ret == %x\n", ret);
        return ret;
 }

@@ -836,8 +838,10 @@
 {
        uint32_t reg;

-       if (amdgpu_sriov_vf(adev))
+       if (amdgpu_sriov_vf(adev)) {
+               DRM_INFO("amdgpu_sriov_vf(adev) == false\n");
                return false;
+       }

        if (amdgpu_passthrough(adev)) {
                /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
@@ -846,6 +850,7 @@
                 * vpost executed for smc version below 22.15
                 */
                if (adev->asic_type == CHIP_FIJI) {
+                       DRM_INFO("adev->asic_type == CHIP_FIJI\n");
                        int err;
                        uint32_t fw_ver;
                        err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
@@ -860,13 +865,17 @@
        }

        if (adev->has_hw_reset) {
+               DRM_INFO("adev->has_hw_reset == false\n");
                adev->has_hw_reset = false;
                return true;
        }

        /* bios scratch used on CIK+ */
-       if (adev->asic_type >= CHIP_BONAIRE)
+       if (adev->asic_type >= CHIP_BONAIRE) {
+               DRM_INFO("adev->asic_type >= CHIP_BONAIRE\n");
+               DRM_INFO("calling amdgpu_atombios_scratch_need_asic_init(adev)\n");
                return amdgpu_atombios_scratch_need_asic_init(adev);
+    }

        /* check MEM_SIZE for older asics */
        reg = amdgpu_asic_get_config_memsize(adev);
@@ -4013,8 +4022,10 @@
                        adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
                if (adev->pm.pcie_mlw_mask == 0)
                        adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
-               return;
+               // return;
        }
+
+       adev->pm.pcie_gen_mask = CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3;

        if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
                return;

agrajag9 commented 3 years ago

Added a little more to the patch and seeing some other interesting things:

# kldload -v amdgpu
anon_inodefs registered
debugfs registered
<6>[drm] amdgpu kernel modesetting enabled.
drmn0: <drmn> on vgapci0
vgapci0: child drmn0 requested pci_enable_io
vgapci0: child drmn0 requested pci_enable_io
sysctl_warn_reuse: can't re-use a leaf (hw.dri.debug)!
<6>[drm] initializing kernel modesetting (POLARIS12 0x1002:0x699F 0x1DA2:0xE367 0xC7).
<6>[drm] register mmio base: 0x40000000
<6>[drm] register mmio size: 262144
<6>[drm] add ip block number 0 <vi_common>
<6>[drm] add ip block number 1 <gmc_v8_0>
<6>[drm] add ip block number 2 <tonga_ih>
<6>[drm] add ip block number 3 <gfx_v8_0>
<6>[drm] add ip block number 4 <sdma_v3_0>
<6>[drm] add ip block number 5 <powerplay>
<6>[drm] add ip block number 6 <dm>
<6>[drm] add ip block number 7 <uvd_v6_0>
<6>[drm] add ip block number 8 <vce_v3_0>
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == fc3
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 5cb
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 5cf
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 3301
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 3348
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 1ad
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] UVD is enabled in VM mode
<6>[drm] UVD ENC is enabled in VM mode
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 1ad
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] VCE enabled in VM mode
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 1ad
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 1ad
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] adev->asic_type >= CHIP_BONAIRE
<6>[drm] calling amdgpu_atombios_scratch_need_asic_init(adev)
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 5d0
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c:
<6>[drm]        adev->bios_scratch_reg_offset      == 5c9
<6>[drm] RREG32(adev->bios_scratch_reg_offset + 7) == 0
<6>[drm] GPU posting now...
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 83
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 82
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 83
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 82

This repeats several thousand times until the eventual panic at 10s. Either we're failing to properly read the registers or we're pointed at the wrong location in memory.

I went further down the rabbithole and I'm wondering if maybe the linuxkpi pci code is doing something wrong here? https://github.com/freebsd/drm-kmod/blob/b45715c9a147c06fdc1c298cbcb52e1ebf9f81f3/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L2684-L2686

valpackett commented 3 years ago

WAIT WAIT WAIT.

nvme0@pci2:1:0:0:       class=0x010802 rev=0x00 hdr=0x00 vendor=0x144d device=0xa808 subvendor=0x144d subdevice=0xa801
    bar   [10] = type Memory, range 64, base 0x40000000, size 16384, enabled
vgapci0@pci4:1:0:0:     class=0x030000 rev=0xc7 hdr=0x00 vendor=0x1002 device=0x699f subvendor=0x1da2 subdevice=0xe367
    bar   [24] = type Memory, range 32, base 0x40000000, size 262144, enabled

Is this.. supposed to happen.. or are these host physical addresses and did both PCIe controllers map their devices into the same address??

Just in case I'm not completely stupid, please test without an NVMe drive, using SATA or USB for the system disk.

agrajag9 commented 3 years ago

Removed the NVMe and booted from USB. It still tries to post and panics, but it looks like it might be at least accessing different registers eventually?

<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 83
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 83
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 83
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 83
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 83
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 83
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 83
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == e
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == f
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == e
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == e
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
...
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 16f5
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 16f4
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 16f4
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
<6>[drm] In amdgpu_mm_rreg:
<6>[drm] (reg * 4) < adev->rmmio_size
<6>[drm] reg == 16fb
<6>[drm] adev->rmmio_size == 40000
<6>[drm] ret == 0
[drm ERROR :atom_op_jump] atombios stuck in loop for more than 10secs aborting
[drm ERROR :amdgpu_atom_execute_table_locked] atombios stuck executing CA56 (len 130, WS 0, PS 0) @ 0xCA75
[drm ERROR :amdgpu_atom_execute_table_locked] atombios stuck executing A984 (len 158, WS 0, PS 8) @ 0xA9B9
drmn0: gpu post error!

agrajag9 commented 3 years ago

Well, that's curious... https://gist.github.com/agrajag9/6986f05cd70baa774384fe8249aa356e#file-honeycomb-aml-L4604 and https://gist.github.com/agrajag9/6986f05cd70baa774384fe8249aa356e#file-honeycomb-aml-L4847 Looks like that's where we're getting our base address and it's the same for both busses.

valpackett commented 3 years ago

That's range minimum, but they have different translation offset. In pciconf output we only see that offset applied in "Prefetchable Memory" BARs but not ones marked just "Memory". So might not be an issue after all (?) I really don't know at this point, it just looks suspicious.

agrajag9 commented 3 years ago

This is interesting...

https://gist.github.com/agrajag9/cfc0a6887a8a001d0b35335ba4de0400

Starts with

vgapci0: In sys/compat/linuxkpi/common/src/linux_pci.c:
vgapci0: rle->type  == 0x3
vgapci0: rle->rid   == 0x10
vgapci0: rle->flags == 0x1
vgapci0: rle->start == 0xa400000000
vgapci0: rle->end   == 0xa40fffffff
vgapci0: rle->count == 0x10000000

but then almost immediately after:

vgapci0: In sys/compat/linuxkpi/common/src/linux_pci.c:
vgapci0: rle->type  == 0x3
vgapci0: rle->rid   == 0x24
vgapci0: rle->flags == 0x1
vgapci0: rle->start == 0x40000000
vgapci0: rle->end   == 0x4003ffff
vgapci0: rle->count == 0x40000

From pciconf -lvbc:

    bar   [10] = type Prefetchable Memory, range 64, base 0xa400000000, size 268435456, enabled
    bar   [18] = type Prefetchable Memory, range 64, base 0xa410000000, size 2097152, enabled
    bar   [20] = type I/O Port, range 32, base 0, size 256, disabled
    bar   [24] = type Memory, range 32, base 0x40000000, size 262144, enabled

valpackett commented 3 years ago

You've replicated pciconf with log statements :) Look at the rle->rid, the first output matches [10], the second is for [24].

BTW, full info about the BARs is found at https://rocmdocs.amd.com/en/latest/GCN_ISA_Manuals/PCIe-features.html#bar-memory-overview — so the [10] is VRAM, [18] is doorbell, [24] is of course the configuration registers that all return 0 on reads for you…

Looking at the Linux dmesg again

[    1.774856] pci 0002:01:00.0: BAR 0: assigned [mem 0x9400000000-0x9400003fff 64bit]

[    1.993408] pci 0004:01:00.0: BAR 0: assigned [mem 0xa400000000-0xa40fffffff 64bit pref]
[    2.001497] pci 0004:01:00.0: BAR 2: assigned [mem 0xa410000000-0xa4101fffff 64bit pref]
[    2.009580] pci 0004:01:00.0: BAR 5: assigned [mem 0xa040000000-0xa04003ffff]
[    2.016707] pci 0004:01:00.0: BAR 6: assigned [mem 0xa040040000-0xa04005ffff pref]
[    2.024264] pci 0004:01:00.1: BAR 0: assigned [mem 0xa410200000-0xa410203fff 64bit]
[    2.031916] pci 0004:01:00.0: BAR 4: assigned [io  0x10000-0x100ff]

so, we are supposed to assign 0xa040000000 to this BAR, and 0x9400000000 to the NVMe drive's only one, otherwise they collide. Looks like this is the bug after all.

valpackett commented 3 years ago

Potentially useful logging:

diff --git i/sys/dev/pci/pci_host_generic.c w/sys/dev/pci/pci_host_generic.c
index 0c45f5d316e..3f999d86c5b 100644
--- i/sys/dev/pci/pci_host_generic.c
+++ w/sys/dev/pci/pci_host_generic.c
@@ -345,6 +345,7 @@ generic_pcie_translate_resource(device_t dev, int type, rman_res_t start,
            phys_base = sc->ranges[i].phys_base;
            size = sc->ranges[i].size;

+           device_printf(dev, "translate: start %lx pci_base %lx phys_base %lx size %x\n", start, pci_base, phys_base, size);
            if (start < pci_base || start >= pci_base + size)
                continue;

@@ -364,6 +365,7 @@ generic_pcie_translate_resource(device_t dev, int type, rman_res_t start,
            if (type == space) {
                *new_start = start - pci_base + phys_base;
                *new_end = end - pci_base + phys_base;
+               device_printf(dev, "translate: new start %lx end %lx\n", new_start, new_end);
                found = true;
                break;
            }
@@ -412,6 +414,10 @@ pci_host_generic_core_alloc_resource(device_t dev, device_t child, int type,
            device_get_nameunit(child));
        return (NULL);
    }
+   device_printf(dev,
+       "translated resource %jx-%jx type %x for %s to %lx-%lx\n",
+       (uintmax_t)start, (uintmax_t)end, type,
+       device_get_nameunit(child), phys_start, phys_end);

    if (bootverbose) {
        device_printf(dev,
@@ -456,9 +462,14 @@ generic_pcie_activate_resource(device_t dev, device_t child, int type,

    start = rman_get_start(r);
    end = rman_get_end(r);
+   rman_res_t ostart = start, oend = end;
    if (!generic_pcie_translate_resource(dev, type, start, end, &start,
        &end))
        return (EINVAL);
+   device_printf(dev,
+       "activate:translated resource %jx-%jx type %x for %s to %lx-%lx\n",
+       (uintmax_t)ostart, (uintmax_t)oend, type,
+       device_get_nameunit(child), start, end);
    rman_set_start(r, start);
    rman_set_end(r, end);

diff --git i/sys/dev/pci/pci_host_generic_acpi.c w/sys/dev/pci/pci_host_generic_acpi.c
index 763a84d2fd5..d16d614f5b1 100644
--- i/sys/dev/pci/pci_host_generic_acpi.c
+++ w/sys/dev/pci/pci_host_generic_acpi.c
@@ -157,6 +157,7 @@ pci_host_generic_acpi_parse_resource(ACPI_RESOURCE *res, void *arg)
        res->Data.Address.ResourceType == ACPI_IO_RANGE) {
        sc->base.ranges[r].pci_base = min;
        sc->base.ranges[r].phys_base = min + off;
+       device_printf(dev, "ACPIPCI-parse range %d pci_base %lx phys_base %lx\n", r, min, min + off);
        sc->base.ranges[r].size = max - min + 1;
        if (res->Data.Address.ResourceType == ACPI_MEMORY_RANGE)
            sc->base.ranges[r].flags |= FLAG_TYPE_MEM;

Not tested so you'd have to fix the errors if there are any.

agrajag9 commented 3 years ago

No change to the output when loading the module, but dmesg here: https://gist.github.com/agrajag9/be5c9c58b91497923ae9512dac32f0d3

pcib0: <Generic PCI host controller> on acpi0
pcib0: ACPIPCI-parse range 0 pci_base 40000000 phys_base 9040000000
pcib0: ACPIPCI-parse range 1 pci_base 9400000000 phys_base 9400000000
pcib0: ACPIPCI-parse range 2 pci_base 0 phys_base 9010000000
pcib0: Bus is cache-coherent
pcib0: ECAM for bus 1-255 at mem 9000100000-900fffffff

pcib1: <Generic PCI host controller> on acpi0
pcib1: ACPIPCI-parse range 0 pci_base 40000000 phys_base a040000000
pcib1: ACPIPCI-parse range 1 pci_base a400000000 phys_base a400000000
pcib1: ACPIPCI-parse range 2 pci_base 0 phys_base a010000000
pcib1: Bus is cache-coherent
pcib1: ECAM for bus 1-255 at mem a000100000-a00fffffff
pcib1: translate: start a400000000 pci_base 40000000 phys_base a040000000 size c0000000
pcib1: translate: start a400000000 pci_base a400000000 phys_base a400000000 size 400000000
pcib1: translate: new start a400000000 end a40fffffff
pcib1: translated resource a400000000-a40fffffff type 3 for (null) to a400000000-a40fffffff
pcib1: rman_reserve_resource: start=0xa400000000, end=0xa40fffffff, count=0x10000000
pcib1: translate: start a410000000 pci_base 40000000 phys_base a040000000 size c0000000
pcib1: translate: start a410000000 pci_base a400000000 phys_base a400000000 size 400000000
pcib1: translate: new start a410000000 end a4101fffff
pcib1: translated resource a410000000-a4101fffff type 3 for (null) to a410000000-a4101fffff
pcib1: rman_reserve_resource: start=0xa410000000, end=0xa4101fffff, count=0x200000
pcib1: translate: start 40000000 pci_base 40000000 phys_base a040000000 size c0000000
pcib1: translate: new start a040000000 end a04003ffff
pcib1: translated resource 40000000-4003ffff type 3 for (null) to a040000000-a04003ffff
pcib1: rman_reserve_resource: start=0x40000000, end=0x4003ffff, count=0x40000
pcib1: translate: start 40040000 pci_base 40000000 phys_base a040000000 size c0000000
pcib1: translate: new start a040040000 end a040043fff
pcib1: translated resource 40040000-40043fff type 3 for (null) to a040040000-a040043fff
pcib1: rman_reserve_resource: start=0x40040000, end=0x40043fff, count=0x4000

Also had to change a few things in your patch to make it work right, also in that gist.

agrajag9 commented 3 years ago

Interestingly I think we only see this for pcib1. There are no translation lines for pcib0, when I'm pretty sure there should be.

valpackett commented 3 years ago

Oh, you should've loaded amdgpu in that dmesg, the activate:translated line is for that case. The translated addresses are actually set on the resource only on activation. Which, hmm, 1) why? and 2) maybe the activation is just not happening..somehow?

In any case,

diff --git i/sys/dev/pci/pci_host_generic.c w/sys/dev/pci/pci_host_generic.c
index 0c45f5d316e..99927487e29 100644
--- i/sys/dev/pci/pci_host_generic.c
+++ w/sys/dev/pci/pci_host_generic.c
@@ -419,7 +419,7 @@ pci_host_generic_core_alloc_resource(device_t dev, device_t child, int type,
            start, end, count);
    }

-   res = rman_reserve_resource(rm, start, end, count, flags, child);
+   res = rman_reserve_resource(rm, phys_start, phys_end, count, flags, child);
    if (res == NULL)
        goto fail;

try pciconf -lvbc with this patch and then load amdgpu

agrajag9 commented 3 years ago

New dmesg: https://gist.github.com/agrajag9/7a46de7807c43a8bea3c876727d85820

Except now it panics WAY faster:

# kldload -v amdgpu
anon_inodefs registered
debugfs registered
<6>[drm] amdgpu kernel modesetting enabled.
drmn0: <drmn> on vgapci0
vgapci0: In sys/compat/linuxkpi/common/src/linux_pci.c:
vgapci0: rle->type  == 0x3
vgapci0: rle->rid   == 0x10
vgapci0: rle->flags == 0x1
vgapci0: rle->start == 0xa400000000
vgapci0: rle->end   == 0xa40fffffff
vgapci0: rle->count == 0x10000000
vgapci0: child drmn0 requested pci_enable_io
vgapci0: child drmn0 requested pci_enable_io
sysctl_warn_reuse: can't re-use a leaf (hw.dri.debug)!
<6>[drm] initializing kernel modesetting (POLARIS12 0x1002:0x699F 0x1DA2:0xE367 0xC7).
panic: Assertion size > 0 failed at /usr/src/sys/kern/subr_vmem.c:1332
cpuid = 0
time = 1625091021
KDB: stack backtrace:
db_trace_self() at db_trace_self
db_trace_self_wrapper() at db_trace_self_wrapper+0x30
vpanic() at vpanic+0x184
panic() at panic+0x44
vmem_alloc() at vmem_alloc+0x104
kva_alloc() at kva_alloc+0x28
pmap_mapdev_attr() at pmap_mapdev_attr+0xbc
_ioremap_attr() at _ioremap_attr+0x10
amdgpu_device_init() at amdgpu_device_init+0x8fc
amdgpu_driver_load_kms() at amdgpu_driver_load_kms+0x48
drm_dev_register() at drm_dev_register+0xcc
amdgpu_pci_probe() at amdgpu_pci_probe+0x210
linux_pci_attach_device() at linux_pci_attach_device+0x294
device_attach() at device_attach+0x400
device_probe_and_attach() at device_probe_and_attach+0x7c
bus_generic_driver_added() at bus_generic_driver_added+0x74
devclass_driver_added() at devclass_driver_added+0x44
devclass_add_driver() at devclass_add_driver+0x140
_linux_pci_register_driver() at _linux_pci_register_driver+0xc8
amdgpu_evh() at amdgpu_evh+0xb4
module_register_init() at module_register_init+0xc4
linker_load_module() at linker_load_module+0xb2c
kern_kldload() at kern_kldload+0x15c
sys_kldload() at sys_kldload+0x64
do_el0_sync() at do_el0_sync+0x4a0
handle_el0_sync() at handle_el0_sync+0x90
--- exception, esr 0x56000000
KDB: enter: panic
[ thread pid 65343 tid 100525 ]
Stopped at      kdb_enter+0x44: undefined       f904411f
db>

valpackett commented 3 years ago

pci_host_generic_core_alloc_resource FAIL

oh.. okay. So it must be done this way for a reason.

BTW:

pci0: on pcib0

I'm not sure if there's any harm caused by this (quite possibly none) but I've noticed https://reviews.freebsd.org/D30953 has appeared recently to fix this

valpackett commented 3 years ago

Okay now I think I see it.

LinuxKPI uses BUS_TRANSLATE_RESOURCE to actually get the physical address.

pci_host_generic does not implement it. Only ofw_pcib currently does in the whole tree.

Because of that, LinuxKPI returns the PCI address instead of the translated physical address to the driver.

diff --git i/sys/dev/pci/pci_host_generic.c w/sys/dev/pci/pci_host_generic.c
index 0c45f5d316e..6694da9d43c 100644
--- i/sys/dev/pci/pci_host_generic.c
+++ w/sys/dev/pci/pci_host_generic.c
@@ -324,7 +324,7 @@ pci_host_generic_core_release_resource(device_t dev, device_t child, int type,
 }

 static bool
-generic_pcie_translate_resource(device_t dev, int type, rman_res_t start,
+generic_pcie_translate_resource_end(device_t dev, int type, rman_res_t start,
     rman_res_t end, rman_res_t *new_start, rman_res_t *new_end)
 {
    struct generic_pcie_core_softc *sc;
@@ -380,6 +380,16 @@ generic_pcie_translate_resource(device_t dev, int type, rman_res_t start,
    return (found);
 }

+static int
+generic_pcie_translate_resource(device_t bus, int type,
+    rman_res_t start, rman_res_t *newstart)
+{
+   rman_res_t newend; /* unused */
+
+   return (generic_pcie_translate_resource_end(
+       bus, type, start, 0, newstart, &newend));
+}
+
 struct resource *
 pci_host_generic_core_alloc_resource(device_t dev, device_t child, int type,
     int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
@@ -404,7 +414,7 @@ pci_host_generic_core_alloc_resource(device_t dev, device_t child, int type,
            type, rid, start, end, count, flags));

    /* Translate the address from a PCI address to a physical address */
-   if (!generic_pcie_translate_resource(dev, type, start, end, &phys_start,
+   if (!generic_pcie_translate_resource_end(dev, type, start, end, &phys_start,
        &phys_end)) {
        device_printf(dev,
            "Failed to translate resource %jx-%jx type %x for %s\n",
@@ -456,7 +466,7 @@ generic_pcie_activate_resource(device_t dev, device_t child, int type,

    start = rman_get_start(r);
    end = rman_get_end(r);
-   if (!generic_pcie_translate_resource(dev, type, start, end, &start,
+   if (!generic_pcie_translate_resource_end(dev, type, start, end, &start,
        &end))
        return (EINVAL);
    rman_set_start(r, start);
@@ -527,6 +537,7 @@ static device_method_t generic_pcie_methods[] = {
    DEVMETHOD(bus_activate_resource,    generic_pcie_activate_resource),
    DEVMETHOD(bus_deactivate_resource,  generic_pcie_deactivate_resource),
    DEVMETHOD(bus_release_resource,     pci_host_generic_core_release_resource),
+   DEVMETHOD(bus_translate_resource,   generic_pcie_translate_resource),
    DEVMETHOD(bus_setup_intr,       bus_generic_setup_intr),
    DEVMETHOD(bus_teardown_intr,        bus_generic_teardown_intr),

freebsd / drm-kmod

RX570/POLARIS12 panic during GPU post on 13/stable aarch64 #84