Open neo954 opened 5 years ago
I tried to get the tty state with stty -a
before and after I run rcons
. But the two of outputs looked exactly same.
root@f6u13k13:~# stty -a >stty.out.good
root@f6u13k13:~# rcons f6u13k15
[Enter `^Ec?' for help]
goconserver(2019-05-21T04:31:18-04:00): Hello 10.6.13.13:33358, welcome to the session of f6u13k15
[Disconnected]
root@f6u13k13:~# stty -a >stty.out.1
root@f6u13k13:~# diff -u stty.out.good stty.out.1
root@f6u13k13:~# echo $?
0
root@f6u13k13:~# cat stty.out.good
speed 38400 baud; rows 62; columns 135; line = 0;
intr = ^C; quit = ^\; erase = ^?; kill = ^U; eof = ^D; eol = M-^?; eol2 = M-^?;
swtch = <undef>; start = ^Q; stop = ^S; susp = ^Z; rprnt = ^R; werase = ^W;
lnext = ^V; discard = ^O; min = 1; time = 0;
-parenb -parodd -cmspar cs8 -hupcl -cstopb cread -clocal -crtscts
-ignbrk -brkint -ignpar -parmrk -inpck -istrip -inlcr -igncr icrnl ixon -ixoff
-iuclc ixany imaxbel iutf8
opost -olcuc -ocrnl onlcr -onocr -onlret -ofill -ofdel nl0 cr0 tab0 bs0 vt0 ff0
isig icanon iexten echo echoe echok -echonl -noflsh -xcase -tostop -echoprt
echoctl echoke -flusho -extproc
root@f6u13k13:~# cat stty.out.1
speed 38400 baud; rows 62; columns 135; line = 0;
intr = ^C; quit = ^\; erase = ^?; kill = ^U; eof = ^D; eol = M-^?; eol2 = M-^?;
swtch = <undef>; start = ^Q; stop = ^S; susp = ^Z; rprnt = ^R; werase = ^W;
lnext = ^V; discard = ^O; min = 1; time = 0;
-parenb -parodd -cmspar cs8 -hupcl -cstopb cread -clocal -crtscts
-ignbrk -brkint -ignpar -parmrk -inpck -istrip -inlcr -igncr icrnl ixon -ixoff
-iuclc ixany imaxbel iutf8
opost -olcuc -ocrnl onlcr -onocr -onlret -ofill -ofdel nl0 cr0 tab0 bs0 vt0 ff0
isig icanon iexten echo echoe echok -echonl -noflsh -xcase -tostop -echoprt
echoctl echoke -flusho -extproc
I tried to run strace tftp
when the tty is in the broken state. It seems the read()
system calls of tftp client process failed continuously and the errno
was set to EAGAIN
.
... <<< omit thousands of lines >>> ...
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
read(0, 0x397cdea1410, 1024) = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> ) = 6
... <<< omit thousands of lines >>> ...
Enclosed please find the strace
outputs.
strace.tftp.out.good.txt
strace.tftp.out.problem.txt
This problem can be recreated with cat
.
root@f6u13k13:~# rcons f6u13k14
[Enter `^Ec?' for help]
goconserver(2019-05-21T04:59:08-04:00): Hello 10.6.13.13:38404, welcome to the session of f6u13k14
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
root@f6u13k13:~# cat
cat: -: Resource temporarily unavailable
root@f6u13k13:~# strace cat
execve("/bin/cat", ["cat"], 0x7fffc7e70f20 /* 28 vars */) = 0
brk(NULL) = 0xd9e3c350000
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=28770, ...}) = 0
mmap(NULL, 28770, PROT_READ, MAP_PRIVATE, 3, 0) = 0x794af1900000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/powerpc64le-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0\25\0\1\0\0\0\20G\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2181704, ...}) = 0
mmap(NULL, 2250384, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x794af16d0000
mmap(0x794af18e0000, 131072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x200000) = 0x794af18e0000
close(3) = 0
mprotect(0x794af18e0000, 65536, PROT_READ) = 0
mprotect(0xd9e01270000, 65536, PROT_READ) = 0
mprotect(0x794af1970000, 65536, PROT_READ) = 0
munmap(0x794af1900000, 28770) = 0
openat(AT_FDCWD, "/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=1991136, ...}) = 0
mmap(NULL, 1991136, PROT_READ, MAP_PRIVATE, 3, 0) = 0x794af14e0000
close(3) = 0
brk(NULL) = 0xd9e3c350000
brk(0xd9e3c380000) = 0xd9e3c380000
fstat(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 5), ...}) = 0
fstat(0, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 5), ...}) = 0
fadvise64(0, 0, 0, POSIX_FADV_SEQUENTIAL) = 0
mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x794af14a0000
read(0, 0x794af14b0000, 131072) = -1 EAGAIN (Resource temporarily unavailable)
write(2, "cat: ", 5cat: ) = 5
write(2, "-", 1-) = 1
openat(AT_FDCWD, "/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=2995, ...}) = 0
read(3, "# Locale name alias data base.\n#"..., 4096) = 2995
read(3, "", 4096) = 0
close(3) = 0
openat(AT_FDCWD, "/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale-langpack/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale-langpack/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
write(2, ": Resource temporarily unavailab"..., 34: Resource temporarily unavailable) = 34
write(2, "\n", 1
) = 1
munmap(0x794af14a0000, 262144) = 0
close(0) = 0
close(1) = 0
close(2) = 0
exit_group(1) = ?
+++ exited with 1 +++
Okay, here is the problem.
root@f6u13k13:~# cat /proc/self/fdinfo/0
pos: 0
flags: 02
mnt_id: 26
root@f6u13k13:~# rcons f6u13k14
[Enter `^Ec?' for help]
goconserver(2019-05-21T05:21:47-04:00): Hello 10.6.13.13:41330, welcome to the session of f6u13k14
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
root@f6u13k13:~# cat /proc/self/fdinfo/0
pos: 0
flags: 024002
mnt_id: 26
It seems the problem affected all three file descriptors 0, 1, and 2.
root@f6u13k13:~# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 02
mnt_id: 26
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 02
mnt_id: 26
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 02
mnt_id: 26
root@f6u13k13:~# rcons f6u13k14
[Enter `^Ec?' for help]
goconserver(2019-05-21T05:23:16-04:00): Hello 10.6.13.13:41500, welcome to the session of f6u13k14
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
root@f6u13k13:~# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 024002
mnt_id: 26
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 024002
mnt_id: 26
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 024002
mnt_id: 26
See http://man7.org/linux/man-pages/man5/proc.5.html for details of the fdinfo
subdirectory.
flags This is an octal number that displays the file access mode and file status flags (see open(2)). If the close-on-exec file descriptor flag is set, then flags will also include the value
O_CLOEXEC
. Before Linux 3.1, this field incorrectly displayed the setting ofO_CLOEXEC
at the time the file was opened, rather than the current setting of the close-on-exec flag.
In file /usr/src/linux-headers-4.15.0-47/include/uapi/asm-generic/fcntl.h
#define O_RDWR 00000002
#ifndef O_NONBLOCK
#define O_NONBLOCK 00004000
#endif
#ifndef FASYNC
#define FASYNC 00020000 /* fcntl, for BSD compatibility */
#endif
root@f6u13k13:~# uname -a
Linux f6u13k13 4.15.0-47-generic #50-Ubuntu SMP Wed Mar 13 10:40:40 UTC 2019 ppc64le ppc64le ppc64le GNU/Linux
This is what happened on a RHEL 8 testing environment.
[root@c910f03c01p19 ~]# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 02
mnt_id: 25
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 02
mnt_id: 25
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 02
mnt_id: 25
[root@c910f03c01p19 ~]# rcons c910f03c01p10
[Enter `^Ec?' for help]
goconserver(2019-05-21T05:47:27-04:00): Hello 10.3.1.19:59050, welcome to the session of c910f03c01p10
done
[Disconnected]
[root@c910f03c01p19 ~]# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 020002
mnt_id: 25
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 020002
mnt_id: 25
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 020002
mnt_id: 25
[root@c910f03c01p19 ~]# uname -a
Linux c910f03c01p19 4.18.0-80.el8.ppc64le #1 SMP Wed Mar 13 11:26:21 UTC 2019 ppc64le ppc64le ppc64le GNU/Linux
In the latest goconserver
source, v0.3.2
.
$ grep -n -r NONBLOCK .
./console/client.go:132: err := common.Fcntl(in, syscall.F_SETFL, syscall.O_ASYNC|syscall.O_NONBLOCK)
./console/client.go:173: err := common.Fcntl(in, syscall.F_SETFL, syscall.O_ASYNC|syscall.O_NONBLOCK)
./console/cli.go:320: err = common.Fcntl(in, syscall.F_SETFL, syscall.O_ASYNC|syscall.O_NONBLOCK)
@chenglch, Do you have any idea about this issue? :-/
It seems we should blame bash
on Ubuntu. If run zsh
on Ubuntu, the behavior will be the same as RHEL 8.
But the O_ASYNC
flag is still there.
root@c910f03c01p06:~# zsh
c910f03c01p06# head -99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 02
mnt_id: 25
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 02
mnt_id: 25
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 02
mnt_id: 25
c910f03c01p06# rcons c910f03c01p07
[Enter `^Ec?' for help]
goconserver(2019-05-22T01:33:48-04:00): Hello 10.3.1.6:44324, welcome to the session of c910f03c01p07
c910f03c01p07: Logon failed
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
c910f03c01p06# head -99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos: 0
flags: 020002
mnt_id: 25
==> /proc/self/fdinfo/1 <==
pos: 0
flags: 020002
mnt_id: 25
==> /proc/self/fdinfo/2 <==
pos: 0
flags: 020002
mnt_id: 25
Here is a simple test program.
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
int main(int argc, char **argv)
{
int flags = fcntl(0, F_GETFL, 0);
printf("stdin was formerly %sblocking\n", (flags & O_NONBLOCK) ? "non" : "");
int nb = argc < 2 ? 0 : atoi(argv[1]);
ioctl(0, FIONBIO, &nb);
flags = fcntl(0, F_GETFL, 0);
printf("stdin is now %sblocking\n", (flags & O_NONBLOCK) ? "non" : "");
}
On Ubuntu 18.04.2
root@c910f03c01p06:~# cat /proc/self/fdinfo/1
pos: 0
flags: 020002
mnt_id: 25
root@c910f03c01p06:~# ./a.out 1
stdin was formerly blocking
stdin is now nonblocking
root@c910f03c01p06:~# cat /proc/self/fdinfo/1
pos: 0
flags: 024002
mnt_id: 25
On RHEL 8
[root@c910f03c01p19 ~]# cat /proc/self/fdinfo/1
pos: 0
flags: 02
mnt_id: 25
[root@c910f03c01p19 ~]# ./a.out 1
stdin was formerly blocking
stdin is now nonblocking
[root@c910f03c01p19 ~]# cat /proc/self/fdinfo/1
pos: 0
flags: 02
mnt_id: 25
Here is the bug recreation steps.
Run
rcons
against a compute node, and then, pressCtrl-E C ?
. After that, run tftp client,tftp
. The tftp client will print out command line prompt repeatedly and endlessly.Additional information
The xCAT management node runs Ubuntu 18.04.2 on a ppc64el node. It has xCAT
2.15-snap201905170621
installed. The xCAT compute nodef6u13k15
is a regular KVM guest.