Closed systemcrash closed 5 years ago
Thanks for the report. I am able to reproduce it as well. Will investigate
Something to do with the socket file - trace reveals:
root@darkstar:/# strace nlbw -c json
execve("/usr/sbin/nlbw", ["nlbw", "-c", "json"], [/* 13 vars */]) = 0
set_thread_area(0x776b7d48) = 0
set_tid_address(0x776b0cac) = 11513
open("/etc/ld-musl-mips-sf.path", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib/libubox.so", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = 3
fcntl64(3, F_SETFD, FD_CLOEXEC) = 0
fstat64(3, {st_mode=S_IFREG|0644, st_size=33336, ...}) = 0
read(3, "\177ELF\1\2\1\0\0\0\0\0\0\0\0\0\0\3\0\10\0\0\0\1\0\0\0360\0\0\0004"..., 936) = 936
mmap2(NULL, 102400, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x775f2000
mmap2(0x77609000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0x7000) = 0x77609000
close(3) = 0
open("/lib/libnl-tiny.so", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/usr/local/lib/libnl-tiny.so", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/usr/lib/libnl-tiny.so", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = 3
fcntl64(3, F_SETFD, FD_CLOEXEC) = 0
fstat64(3, {st_mode=S_IFREG|0755, st_size=25440, ...}) = 0
read(3, "\177ELF\1\2\1\0\0\0\0\0\0\0\0\0\0\3\0\10\0\0\0\1\0\0\33\220\0\0\0004"..., 936) = 936
mmap2(NULL, 94208, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x775da000
mmap2(0x775ef000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0x5000) = 0x775ef000
close(3) = 0
open("/lib/libz.so.1", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/usr/local/lib/libz.so.1", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/usr/lib/libz.so.1", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = 3
fcntl64(3, F_SETFD, FD_CLOEXEC) = 0
fstat64(3, {st_mode=S_IFREG|0755, st_size=61720, ...}) = 0
read(3, "\177ELF\1\2\1\0\0\0\0\0\0\0\0\0\0\3\0\10\0\0\0\1\0\0\24\200\0\0\0004"..., 936) = 936
mmap2(NULL, 131072, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x775ba000
mmap2(0x775d8000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0xe000) = 0x775d8000
close(3) = 0
open("/lib/libgcc_s.so.1", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = 3
fcntl64(3, F_SETFD, FD_CLOEXEC) = 0
fstat64(3, {st_mode=S_IFREG|0644, st_size=77040, ...}) = 0
read(3, "\177ELF\1\2\1\0\0\0\0\0\0\0\0\0\0\3\0\10\0\0\0\1\0\0(\20\0\0\0004"..., 936) = 936
mmap2(NULL, 143360, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x77596000
mmap2(0x775b8000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0x12000) = 0x775b8000
close(3) = 0
mprotect(0x77609000, 4096, PROT_READ) = 0
mprotect(0x775ef000, 4096, PROT_READ) = 0
mprotect(0x775d8000, 4096, PROT_READ) = 0
mprotect(0x415000, 4096, PROT_READ) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1516199625, tv_nsec=124123469}) = 0
getpid() = 11513
socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE) = 3
setsockopt(3, SOL_SOCKET, SO_SNDBUF, [32768], 4) = 0
setsockopt(3, SOL_SOCKET, SO_RCVBUF, [32768], 4) = 0
bind(3, {sa_family=AF_NETLINK, nl_pid=11513, nl_groups=00000000}, 12) = 0
getsockname(3, {sa_family=AF_NETLINK, nl_pid=11513, nl_groups=00000000}, [12]) = 0
open("/usr/share/nlbwmon/protocols", O_RDONLY|O_LARGEFILE) = 4
readv(4, [{iov_base="", iov_len=0}, {iov_base="1 0 ICMP\n2 0 IGMP\n4 0 IP-in-IP\n6"..., iov_len=1024}], 2) = 664
readv(4, [{iov_base="", iov_len=0}, {iov_base="", iov_len=1024}], 2) = 0
close(4) = 0
socket(AF_UNIX, SOCK_STREAM, 0) = 4
fcntl64(4, F_GETFD) = 0
fcntl64(4, F_SETFD, FD_CLOEXEC) = 0
connect(4, {sa_family=AF_UNIX, sun_path="/var/run/nlbwmon.sock"}, 110) = -1 ECONNREFUSED (Connection refused)
close(4) = 0
sendto(-1, "dump 0", 6, 0, NULL, 0) = -1 EBADF (Bad file descriptor)
close(-1) = -1 EBADF (Bad file descriptor)
writev(2, [{iov_base="Error while processing command: "..., iov_len=52}, {iov_base=NULL, iov_len=0}], 2Error while processing command: Bad file descriptor
) = 52
exit_group(-9) = ?
+++ exited with 247 +++
root@darkstar:/tmp/run# ls -alti nlbwmon.sock
219873 srwxr-xr-x 1 root root 0 Jan 17 15:30 nlbwmon.sock
It's there somewhere
root@darkstar:/tmp/run# netstat -a -e
snip
Active UNIX domain sockets (servers and established)
Proto RefCnt Flags Type State I-Node Path
unix 9 [ ] DGRAM 1034 /dev/log
unix 2 [ ACC ] STREAM LISTENING 308 /var/run/ubus.sock
unix 2 [ ] DGRAM 2422 /var/run/hostapd/wlan0
unix 2 [ ACC ] STREAM LISTENING 174973 /var/run/nlbwmon.sock
unix 2 [ ] DGRAM 2269 /var/run/hostapd/wlan1
unix 3 [ ] STREAM CONNECTED 2396
unix 3 [ ] STREAM CONNECTED 1215 /var/run/ubus.sock
unix 3 [ ] STREAM CONNECTED 1062
unix 2 [ ] DGRAM 3127
unix 2 [ ] DGRAM 1804
unix 3 [ ] STREAM CONNECTED 1037 /var/run/ubus.sock
unix 3 [ ] STREAM CONNECTED 315 /var/run/ubus.sock
unix 2 [ ] DGRAM 1274
unix 2 [ ] DGRAM 1908
unix 3 [ ] STREAM CONNECTED 1063 /var/run/ubus.sock
unix 3 [ ] STREAM CONNECTED 1036
unix 2 [ ] DGRAM 2250
unix 3 [ ] STREAM CONNECTED 314
unix 3 [ ] STREAM CONNECTED 2449
unix 3 [ ] STREAM CONNECTED 1168 /var/run/ubus.sock
unix 3 [ ] STREAM CONNECTED 1167
unix 2 [ ] DGRAM 96374
unix 3 [ ] STREAM CONNECTED 1801 /var/run/ubus.sock
unix 3 [ ] STREAM CONNECTED 1045 /var/run/ubus.sock
unix 2 [ ] DGRAM 313
unix 2 [ ] DGRAM 2211
unix 2 [ ] DGRAM 1164
unix 3 [ ] STREAM CONNECTED 1214
unix 3 [ ] STREAM CONNECTED 2450 /var/run/ubus.sock
unix 3 [ ] STREAM CONNECTED 1044
unix 3 [ ] STREAM CONNECTED 1800
unix 3 [ ] STREAM CONNECTED 2397 /var/run/ubus.sock
root@darkstar:/proc/9277# lsof -c nlbwmon
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
nlbwmon 9277 root cwd DIR 0,14 0 198 /
nlbwmon 9277 root rtd DIR 0,14 0 198 /
nlbwmon 9277 root txt REG 31,3 24949 1175 /usr/sbin/nlbwmon
nlbwmon 9277 root mem REG 31,2 77040 619 /lib/libgcc_s.so.1
nlbwmon 9277 root mem REG 31,3 61720 852 /usr/lib/libz.so.1.2.11
nlbwmon 9277 root mem REG 31,2 25440 297 /usr/lib/libnl-tiny.so
nlbwmon 9277 root mem REG 31,2 33336 622 /lib/libubox.so
nlbwmon 9277 root mem REG 31,2 601968 661 /lib/libc.so
nlbwmon 9277 root mem REG 0,13 27 962 /tmp/TZ
nlbwmon 9277 root 0r CHR 1,3 0t0 256 /dev/null
nlbwmon 9277 root 1w CHR 1,3 0t0 256 /dev/null
nlbwmon 9277 root 2w FIFO 0,8 0t0 174968 pipe
nlbwmon 9277 root 3u netlink 0t0 174970 ROUTE
nlbwmon 9277 root 4u a_inode 0,9 0 11 [eventpoll]
nlbwmon 9277 root 5r FIFO 0,8 0t0 174971 pipe
nlbwmon 9277 root 6w FIFO 0,8 0t0 174971 pipe
nlbwmon 9277 root 7u netlink 0t0 174972 NETFILTER
nlbwmon 9277 root 8u unix 0x86c098c0 0t0 174973 /var/run/nlbwmon.sock type=STREAM
Can you run strace -p $(pidof nlbwmon)
while you get connection refused with the client?
Not sure what kind of answer you were expecting here, but yes, I can.
I occasionally see epoll_pwait going past, and occasionally a flood of data rolls by.
here's a 30-second output.
And another when I did two page refreshes for http://..../cgi-bin/luci/admin/nlbw/display
Could this be something to do with the inodes changing?
root@darkstar:/tmp/run# ls -li nlbwmon.sock
221911 srwxr-xr-x 1 root root 0 Jan 17 17:04 nlbwmon.sock
root@darkstar:/tmp/run# lsof -c nlbw
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
nlbwmon 9277 root cwd DIR 0,14 0 198 /
nlbwmon 9277 root rtd DIR 0,14 0 198 /
nlbwmon 9277 root txt REG 31,3 24949 1175 /usr/sbin/nlbwmon
nlbwmon 9277 root mem REG 31,2 77040 619 /lib/libgcc_s.so.1
nlbwmon 9277 root mem REG 31,3 61720 852 /usr/lib/libz.so.1.2.11
nlbwmon 9277 root mem REG 31,2 25440 297 /usr/lib/libnl-tiny.so
nlbwmon 9277 root mem REG 31,2 33336 622 /lib/libubox.so
nlbwmon 9277 root mem REG 31,2 601968 661 /lib/libc.so
nlbwmon 9277 root mem REG 0,13 27 962 /tmp/TZ
nlbwmon 9277 root 0r CHR 1,3 0t0 256 /dev/null
nlbwmon 9277 root 1w CHR 1,3 0t0 256 /dev/null
nlbwmon 9277 root 2w FIFO 0,8 0t0 174968 pipe
nlbwmon 9277 root 3u netlink 0t0 174970 ROUTE
nlbwmon 9277 root 4u a_inode 0,9 0 11 [eventpoll]
nlbwmon 9277 root 5r FIFO 0,8 0t0 174971 pipe
nlbwmon 9277 root 6w FIFO 0,8 0t0 174971 pipe
nlbwmon 9277 root 7u netlink 0t0 174972 NETFILTER
nlbwmon 9277 root 8u unix 0x86c098c0 0t0 174973 /var/run/nlbwmon.sock type=STREAM
proc reports something different to what the socket file actually has. So perhaps the nlbwmon has run away...
Curiously, after a reboot - I go to display, still "grey". Run strace -p $(pidof nlbwmon)
and everything displays fine... go figure.
Any residual logread messages? Is it the Unable to handle 'dump' command: Resource temporarily unavailable
again?
Yep:
Wed Jan 17 19:53:37 2018 daemon.err nlbwmon[1640]: Unable to handle 'dump' command: Resource temporarily unavailable
Wed Jan 17 19:53:37 2018 daemon.err uhttpd[1405]: Error while processing command: No data available
Just not when strace
is running.
I have noticed this problem for a while. Only thing I have found that helps is to delete the database and start fresh, after that it will run smoothly again for a few days...
I had this. I ended up finding that restarting nlbwmon did the trick to get it behaving.
AFAICT, it doesn't affect data capture, only display.
Not sure if it's related, but I'm finding that the graphical display won't show old data. It works fine for current period, but when requesting old periods, it displays the current period instead.
I had the same problem on openwrt cc(15.05),and I try it on openwrt 18.06 ,it works okay. I suspect it's a problem caused by the kernel version ,old kernel version(3.18.84)doesn't send netlink message to user space nlbwmon.
I got it, the kernel config must be selected "CONFIG_NF_CONNTRACK_EVENTS". openwrt cc default no select it
I seem to have the same problem on 18.06-rc1 on ipq8064. However i have CONFIG_NF_CONNTRACK_EVENTS=y in build_dir/target-arm_cortex-a15+neon-vfpv4_musl_eabi/linux-ipq806x/linux-4.14.50/.config
I still get this issue intermittently. I am on OpenWrt 18.06.0
As reported by @systemcrash running strace -p $(pidof nlbwmon)
temporarily resolves the issue.
I got this issue too after move the path database to usr/lib/nlbwmon After run for 3 days. Suddenly the graph turned to be grey and no data anymore The system log is Resource temporarily unavailable
Hope u have solution for next update run strace while checking is kinda inefficient
I believe this issue has been addressed by ef3fa582b54a6899e7042726157909637974a278. Please open a new issue if you still see this.
The issue persists even after ef3fa58
Running for a few days and I get this in syslog when the GUI shows 'No data recorded yet.' and 'Force reload' for MAC data:
Wed Jan 17 15:17:55 2018 daemon.err nlbwmon[9277]: Unable to handle 'dump' command: Resource temporarily unavailable Wed Jan 17 15:17:55 2018 daemon.err uhttpd[1293]: Error while processing command: No data available Wed Jan 17 15:17:59 2018 daemon.err nlbwmon[9277]: Unable to handle 'dump' command: Resource temporarily unavailable Wed Jan 17 15:17:59 2018 daemon.err uhttpd[1293]: Error while processing command: No data available
running under console completes fine:
./usr/sbin/nlbwmon dump
(lots of data)
Have relocated storage to /opt onto removable USB which is accessible and fine.