LibreQoE / LibreQoS

A Quality of Experience and Smart Queue Management system for ISPs. Leverage CAKE to improve network responsiveness, enforce bandwidth plans, and reduce bufferbloat.
https://libreqos.io/
GNU General Public License v2.0

v1.5-beta: problem with reloading #518

Open - interduo opened this issue 2 months ago

interduo commented 2 months ago

I installed LibreQoS. Running ./LibreQoS.py hangs at: (screenshot)

Checking deeper: (screenshot)

Running ./LibreQoS.py --debug shows that:

MQ detected. Will delete and recreate mq qdisc.
INFO:root:tc qdisc delete dev ens17np0 root
INFO:root:tc qdisc delete dev ens16np0 root

[it hangs here]

strace -p 1677

strace: Process 1677 attached
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661542272, u64=128040931603328}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660354176, u64=128040930415232}}) = 0
futex(0x7473cae00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660354432, u64=128040930415488}}) = 0
futex(0x7473cae00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660355200, u64=128040930416256}}) = 0
futex(0x7473cae00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660355456, u64=128040930416512}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 83
epoll_ctl(5, EPOLL_CTL_ADD, 83, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660355712, u64=128040930416768}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660355968, u64=128040930417024}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660356736, u64=128040930417792}}) = 0
futex(0x7473caa00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660357248, u64=128040930418304}}) = 0
futex(0x7473caa00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660357504, u64=128040930418560}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661439616, u64=128040931500672}}) = 0
futex(0x7473cae00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661442944, u64=128040931504000}}) = 0
futex(0x7473cae00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661456512, u64=128040931517568}}) = 0
futex(0x7473cae00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661457024, u64=128040931518080}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661457280, u64=128040931518336}}) = 0
futex(0x7473caa00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661459072, u64=128040931520128}}) = 0
futex(0x7473caa00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661460864, u64=128040931521920}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660354944, u64=128040930416000}}) = 0
futex(0x7473cae00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660356224, u64=128040930417280}}) = 0
futex(0x7473cae00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661439872, u64=128040931500928}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661441152, u64=128040931502208}}) = 0
futex(0x7473caa00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661442688, u64=128040931503744}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 83
epoll_ctl(5, EPOLL_CTL_ADD, 83, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661459328, u64=128040931520384}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 83
epoll_ctl(5, EPOLL_CTL_ADD, 83, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661459584, u64=128040931520640}}) = 0
futex(0x7473c9c00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 84
epoll_ctl(5, EPOLL_CTL_ADD, 84, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661461376, u64=128040931522432}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661461632, u64=128040931522688}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661462144, u64=128040931523200}}) = 0
futex(0x7473cb200598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661463424, u64=128040931524480}}) = 0
futex(0x7473cb200598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661541504, u64=128040931602560}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661542016, u64=128040931603072}}) = 0
futex(0x7473c9c00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661542528, u64=128040931603584}}) = 0
futex(0x7473c9c00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661542784, u64=128040931603840}}) = 0
futex(0x7473c9c00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661543296, u64=128040931604352}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660354176, u64=128040930415232}}) = 0
futex(0x7473cb200598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660354688, u64=128040930415744}}) = 0
futex(0x7473cb200598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 82
epoll_ctl(5, EPOLL_CTL_ADD, 82, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3660356992, u64=128040930418048}}) = 0
write(4, "\1\0\0\0\0\0\0\0", 8)         = 8
accept4(65, 0x7fff1f362580, [110], SOCK_CLOEXEC|SOCK_NONBLOCK) = -1 EAGAIN (Resource temporarily unavailable)
futex(0x7473db939258, FUTEX_WAIT_PRIVATE, 1, NULL) = 0
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661439360, u64=128040931500416}}) = 0
futex(0x7473c9c00598, FUTEX_WAKE_PRIVATE, 1) = 1
accept4(65, {sa_family=AF_UNIX}, [110 => 2], SOCK_CLOEXEC|SOCK_NONBLOCK) = 79
epoll_ctl(5, EPOLL_CTL_ADD, 79, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=3661440896, u64=128040931501952}}) = 0

If I take the network traffic off the LibreQoS network interfaces (shutting down the VLAN facing the internet), the reload continues and I can see: (screenshot)

interduo commented 2 months ago

Network traffic keeps flowing, but the QoS policies are not applied to it.

root@libreqos-beta:~# uname -a
Linux libreqos-beta 6.8.0-38-generic #38-Ubuntu SMP PREEMPT_DYNAMIC Fri Jun  7 15:25:01 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux
00:10.0 Ethernet controller: Mellanox Technologies MT27700 Family [ConnectX-4]
    Subsystem: Mellanox Technologies MT27700 Family [ConnectX-4]
00:11.0 Ethernet controller: Mellanox Technologies MT27700 Family [ConnectX-4]
    Subsystem: Mellanox Technologies MT27700 Family [ConnectX-4]

The NICs were tested with older LibreQoS versions on Ubuntu 22.04 and worked like a charm.

First look:

root@libreqos-beta:~# tc class show dev ens16np0 | wc -l
5693
root@libreqos-beta:~# tc class show dev ens17np0 | wc -l
5581

thebracket commented 2 months ago

I actually saw something like this for the first time last night. A LibreQoS.py run took 14 minutes to complete, where it used to be maybe a second. I wasn't in a setup where I could do much real diagnosis, so I'll be trying to recreate it and figure out what's getting snarled up. I'm not sure if the lqosd trace is all that useful - because it uses Futexes a lot (they are the base behind Rust's Mutex type, and hard to avoid for any kind of concurrent setup).

It doesn't want to recreate on my local setup, which is going to make this a harder one to debug.

interduo commented 2 months ago

I can give you my Proxmox VM backup if you suspect that I could have done something wrong. The most interesting thing is that if I redirect the network traffic away, the reload completes successfully after a short while. This bug doesn't occur on LibreQoS without network traffic passing through.

thebracket commented 2 months ago

I wonder if ProxMox is the common factor here? Mine was also in ProxMox, passing about 1gbps at the time. It did eventually complete. I'll dive into this as soon as the coffee has done something.

interduo commented 2 months ago

I was carrying ~6 Gbps of summarized network throughput on the interface when I ran ./LibreQoS.py from the console.

thebracket commented 2 months ago

This one is definitely going to be tricky. It's early morning and our traffic is pretty low (~400 Mbps), and it ran without hiccups on the live box. (It also ran on my local system with about a gigabit of iperf traffic being forced through it.)

From timing the parts, it seemed like the longest delays were in:

Executing XDP-CPUMAP-TC IP filter commands
Executed 1281 XDP-CPUMAP-TC IP filter commands

(Not terrible, but enough that I was surprised to see it waiting - it didn't use to slow down there.) Will investigate further.

Update: Running it again shows that there's a really big delay there that didn't used to be there. So at least now I have a candidate to examine.

thebracket commented 2 months ago

I've identified the issue. The "hot cache" was being invalidated after every single IP mapping change, rather than once at the end (you have to invalidate it for changes to appear). So I'm in the process of changing the workflow slightly to explicitly flush at the end. My local test (hacked together rather than nice, shareable code) saw a MASSIVE improvement in reload times doing this.
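As a rough illustration of that workflow change (a sketch only; the real lqosd code and all names here are invented), the pattern is to apply the whole batch of IP-mapping updates first and invalidate the hot cache once at the end:

// Illustrative sketch, not the actual lqosd code: apply all IP-mapping
// updates, then invalidate the hot cache once, instead of once per change.
use std::collections::HashMap;

struct IpMappings {
    table: HashMap<u32, u32>,     // IP (as u32) -> CPU/queue index
    hot_cache: HashMap<u32, u32>, // fast-path lookup cache
}

impl IpMappings {
    fn apply_batch(&mut self, updates: &[(u32, u32)]) {
        for &(ip, queue) in updates {
            self.table.insert(ip, queue);
            // Old workflow: invalidate_hot_cache() ran here, once per change.
        }
        // New workflow: one explicit flush after the whole batch.
        self.invalidate_hot_cache();
    }

    fn invalidate_hot_cache(&mut self) {
        // Changes only become visible to the fast path after this runs.
        self.hot_cache.clear();
    }
}

fn main() {
    let mut m = IpMappings { table: HashMap::new(), hot_cache: HashMap::new() };
    m.apply_batch(&[(0x0A00_0001, 3), (0x0A00_0002, 7)]);
    println!("{} mappings loaded", m.table.len());
}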

interduo commented 2 months ago

Did you try loading 50K circuits? Maybe there are more places to improve?

interduo commented 2 months ago

It's not that important. More important is "no packet loss during reload".

interduo commented 2 months ago

My bet would be you need to run LibreQoS.py and check the output?

I did it. Nothing strange seen there - it even creates the tc classes and tc qdiscs. OK - I will do "round two" on a clean OS from the ISO, using the newly added fixes from the develop branch.

interduo commented 2 months ago

Tested - now it doesn't hang. XDP filters: 0.0256 seconds

But I have a message:

Jul 12 08:40:25 libreqos-beta python3[13342]: /opt/libreqos/src/scheduler.py:62: UserWarning: Some devices were not shaped. Please check to ensure they have a valid ParentNode list>
Jul 12 08:40:25 libreqos-beta python3[13342]:   refreshShapers()

What can I do about it? I have a flat network architecture.

thebracket commented 2 months ago

Double check that you didn't put anything in a parent node that shouldn't be there; I'll be glad to take a look otherwise (I have a "flat" test setup, but don't touch it often - none of my networks are even remotely flat!). If you want, fire up the lqos_support_tool and send a support dump. (Sorry for the edits, my spelling is bad this morning)

interduo commented 2 months ago

Double check that you didn't put anything in a parent node that shouldn't be there;

I don't understand.

root@libreqos-beta:/opt/libreqos/src# cat network.json 
{}

I submitted a dump with lqos_support_tool. Edit as much as you want if your message is the last one in the issue.
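For context, an empty network.json ({}) is how LibreQoS expresses a flat topology. A hierarchical file nests parent nodes instead; roughly like this (the site names and bandwidth figures here are invented for illustration):

{
  "Site_1": {
    "downloadBandwidthMbps": 1000,
    "uploadBandwidthMbps": 1000,
    "children": {
      "AP_A": {
        "downloadBandwidthMbps": 500,
        "uploadBandwidthMbps": 500
      }
    }
  }
}

With the flat {} file, the ParentNode column in ShapedDevices.csv should stay empty; a leftover ParentNode value referring to a node that no longer exists is the sort of mismatch the warning can point at.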

interduo commented 2 months ago

"UserWarning: Some devices were not shaped." - this message says too little. Suggestion: print the offending line of ShapedDevices.csv and/or the CircuitID+DeviceID.

thebracket commented 2 months ago

Thanks for the support dump (I love that new tool!). I don't see anything jumping out in the Shaped Devices list - so I'm going to assume that there's a bug to chase down in the flat network handler in LibreQoS.py. I'll hammer on that later today. Thanks!

(I'm assuming "KOMENTARZ" means comment?)

interduo commented 2 months ago

Yes - KOMENTARZ means comment; in production it is replaced with a circuit description/identity.

interduo commented 2 months ago

Got bad news: this is unfixed even with https://github.com/LibreQoE/LibreQoS/pull/520. It stops reloading at the same point.

interduo commented 1 month ago

"UserWarning: Some devices were not shaped." - this message says too little. Suggestion: print the offending line of ShapedDevices.csv and/or the CircuitID+DeviceID.

@thebracket could you add more info to this error?

interduo commented 1 month ago

Checked again - this bug also exists in the newly released beta2.

Queue and IP filter reload completed in 71.3 seconds
    TC commands:    5.9 seconds
    XDP setup:   61.9 seconds
    XDP filters:     0.2948 seconds
refreshShapers completed on 02/08/2024 14:33:43

When I stop passing network traffic, the reload finishes very quickly.

interduo commented 1 month ago
Aug 05 10:12:55 libreqos-beta lqosd[985]: [2024-08-05T08:12:55Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:12:56 libreqos-beta lqosd[985]: [2024-08-05T08:12:56Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:12:57 libreqos-beta lqosd[985]: [2024-08-05T08:12:57Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:12:58 libreqos-beta lqosd[985]: [2024-08-05T08:12:58Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:12:59 libreqos-beta lqosd[985]: [2024-08-05T08:12:59Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:00 libreqos-beta lqosd[985]: [2024-08-05T08:13:00Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:01 libreqos-beta lqosd[985]: [2024-08-05T08:13:01Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:02 libreqos-beta lqosd[985]: [2024-08-05T08:13:02Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:03 libreqos-beta lqosd[985]: [2024-08-05T08:13:03Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:04 libreqos-beta lqosd[985]: [2024-08-05T08:13:04Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:05 libreqos-beta lqosd[985]: [2024-08-05T08:13:05Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:06 libreqos-beta lqosd[985]: [2024-08-05T08:13:06Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:07 libreqos-beta lqosd[985]: [2024-08-05T08:13:07Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:08 libreqos-beta lqosd[985]: [2024-08-05T08:13:08Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:09 libreqos-beta lqosd[985]: [2024-08-05T08:13:09Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:10 libreqos-beta lqosd[985]: [2024-08-05T08:13:10Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:11 libreqos-beta lqosd[985]: [2024-08-05T08:13:11Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:12 libreqos-beta lqosd[985]: [2024-08-05T08:13:12Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast
Aug 05 10:13:13 libreqos-beta lqosd[985]: [2024-08-05T08:13:13Z WARN  lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast

I found that in my logs

thebracket commented 1 month ago

That's been there since lqosd existed - it just means the queues haven't been made yet, and there's nothing useful to read from a pfifo queue that's there by default.

Is the "hang" still an issue?
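For anyone comparing against their own box: whether the shaping hierarchy actually got built can be checked directly with tc (the interface name below is an example):

tc qdisc show dev ens16np0
# Shaped: a root "mq" qdisc with per-queue entries (HTB/CAKE) underneath.
# Not shaped: only kernel defaults such as "pfifo_fast", which is exactly
# the state the warning above is reporting.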


interduo commented 1 month ago

Yes, but only when heavy traffic is going through LibreQoS.

interduo commented 1 month ago

That's been there since lqosd existed - it just means the queues haven't been made yet, and there's nothing useful to read from a pfifo queue that's there by default.

No - in v1.4 I don't have such a warning message, and lqosd existed there too. "Don't know how to parse" reads to me as "the script doesn't understand" - maybe we should make the message a bit more precise?

Tried to reload without traffic - it goes OK, but the traffic is not shaped.

I am really confused and don't know what else to check.

interduo commented 1 month ago

@rchac I recorded a video: http://kłopotek.pl/lqos/screen-recording-06082024.mp4

interduo commented 1 month ago


One problem is gone (reloading during heavy network traffic) after https://github.com/LibreQoE/LibreQoS/pull/545.

The second one is still an issue (no traffic shaping, and the message "WARN lqos_queue_tracker::queue_types] I don't know how to parse qdisc type pfifo_fast").

I get this message on every reload in journalctl -u lqosd.

thebracket commented 1 month ago

For the 1,000,000,000th time, that warning message isn't a bug.


interduo commented 1 month ago

OK - but I checked again, and in v1.4 (on a working VM) there wasn't the warning you mentioned earlier.

What should I check then?

thebracket commented 1 month ago

The reason you're seeing that message is that when it polls the queues, it's finding pfifo and not Cake - so the message isn't the issue. The question is, why don't you have any queues?

I'd start by going into your config, and changing this line back to 0, the default (or removing it):

override_available_queues = 26 # This can be omitted and be 0 for Python
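A minimal sketch of the suggested end state, assuming the setting sits in a [queues] section of /etc/lqos.conf (the section name and file path are my assumptions; only the key itself appears above):

[queues]
# Back to the default: 0 lets LibreQoS auto-detect the queue count.
# Alternatively, delete the line entirely.
override_available_queues = 0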


interduo commented 1 month ago

I've got the queues loaded (the qdiscs and classes are on the interface), but it looks like the XDP filters are not working. How do I list them?
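For reference, loaded XDP programs and their attachments can be listed with standard kernel tooling rather than anything LibreQoS-specific (the interface name below is an example):

ip link show dev ens16np0             # an attached XDP program shows up as "xdp" plus a prog id
bpftool prog show                     # all loaded BPF programs
bpftool net show                      # XDP and tc attachment points per interface
tc filter show dev ens16np0 ingress   # tc-side BPF filters, if any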

interduo commented 1 month ago

I'm testing, and I get a lot of packet loss :(

I gathered some data to diagnose: http://kłopotek.pl/lqos_beta_problem/

The hardware is fine - on a second VM (with v1.4 and Ubuntu 22.04) I use the same passthrough NICs and they work well.

The maximum network throughput I could achieve with v1.5-beta2 was: (screenshot)

On an almost empty VM, lqosd takes 26% of a CPU core - is that normal? (screenshot)

interduo commented 1 month ago

I tried setting monitor_only = true - there was less packet loss, but only 50% of the network throughput was available. Maybe there is something wrong with the XDP bridge?

I will check tomorrow whether these problems exist on the older Ubuntu LTS (22.04) with the newest LibreQoS.

interduo commented 1 month ago

I tried installing the develop branch on Ubuntu 22.04.

On the older Ubuntu 22.04 there was:

ens19: <BROADCAST,MULTICAST> mtu 1500 qdisc noop

On the newer 24.04:

ens19: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000

When compiling:

Compiling tower-http v0.5.2
warning: lqos_sys@0.1.0: [bpf skel] libbpf: elf: skipping unrecognized data section(11) .rodata.str1.1

Is it important?

Tried to run lqosd:

root@libreqos-beta-oldubuntu:/opt/libreqos/src/bin# ./lqosd
Error: Unable to load the XDP/TC kernel (-13)

I am trying to run lqosd on virtio NICs on the older Ubuntu 22.04 first, and will then test whether it works on the passthrough NICs.

Strace: http://kłopotek.pl/lqos_beta_problem/strace_loading_lqosd_on_olderubuntu

thebracket commented 1 month ago

warning: lqos_sys@0.1.0: [bpf skel] libbpf: elf: skipping unrecognized data section(11) .rodata.str1.1

That's harmless. I just can't stop Linux from emitting it.

Do you have the hot-cache PR applied? I wonder if I exceeded the older instruction limit (the intent was not to require the newer kernel). I'm still hoping for a better solution than the one in that PR (which is why I haven't merged it).

interduo commented 1 month ago

I tested on develop + patch-33 + a cherry-picked commit from #545

testbed1: Ubuntu 24.04
testbed2: Ubuntu 22.04

interduo commented 1 month ago

Is harmless. I just can't stop Linux from emitting it.

OK - on Ubuntu 24.04 there are no warnings during compilation.

thebracket commented 1 month ago

Remind me - Patch-33?

interduo commented 1 month ago

https://github.com/LibreQoE/LibreQoS/pull/505 - patch-33 is the branch name

interduo commented 1 month ago

Tested again with PR https://github.com/LibreQoE/LibreQoS/pull/547/commits. Reloading is OK (not hanging). If you merge PR #547, please close this issue; I'll open another for the next (and maybe last) problem.

interduo commented 1 month ago

Are you back, @thebracket?

thebracket commented 1 month ago

I did, and straight to the world of the unwell (Daughter got a stomach bug, now I'm out with it)

interduo commented 1 month ago

I hope she is better now. Just give a ping when you've got time.

interduo commented 3 weeks ago

I don't know whether it's related to this bug, so I created the next issue: https://github.com/LibreQoE/LibreQoS/issues/549.