xcat2 / xcat-core

Code repo for xCAT core packages
Eclipse Public License 1.0
367 stars 172 forks source link

`rcons` makes stdin, stdout, and stderr O_ASYNC | O_NONBLOCK #6326

Open neo954 opened 5 years ago

neo954 commented 5 years ago

Here is the bug recreation steps.

Run rcons against a compute node, and then, press Ctrl-E C ?. After that, run tftp client, tftp. The tftp client will print out command line prompt repeatedly and endlessly.

root@f6u13k13:~# tftp
tftp>
tftp> 
root@f6u13k13:~# rcons f6u13k15
[Enter `^Ec?' for help]
goconserver(2019-05-21T04:12:05-04:00): Hello 10.6.13.13:33358, welcome to the session of f6u13k15
[Disconnected]
root@f6u13k13:~# tftp
tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp> tftp>
... <<< omit thousands of lines >>> ...

Additional information

The xCAT management node runs Ubuntu 18.04.2 on a ppc64el node. It has xCAT 2.15-snap201905170621 installed. The xCAT compute node f6u13k15 is a regular KVM guest.

root@f6u13k13:~# cat /etc/os-release
NAME="Ubuntu"
VERSION="18.04.2 LTS (Bionic Beaver)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 18.04.2 LTS"
VERSION_ID="18.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=bionic
UBUNTU_CODENAME=bionic
root@f6u13k13:~# go-xcat check
Operating system:   linux
Architecture:       ppc64le
Linux Distribution: ubuntu
Version:            18.04
go-xcat Version:    1.0.38

Reading repositories ...... done

xCAT Core Packages
==================

Package Name                Installed                      In Repository
------------                ---------                      -------------
perl-xcat                   2.15-snap201905170621          2.15-snap201905170621
xcat                        2.15-snap201905170621          2.15-snap201905170621
xcat-buildkit               2.15-snap201905170621          2.15-snap201905170621
xcat-client                 2.15-snap201905170621          2.15-snap201905170621
xcat-confluent              (not installed)                2.15-snap201905170621
xcat-genesis-scripts-amd64  2.15-snap201905170621          2.15-snap201905170621
xcat-genesis-scripts-ppc64  2.15-snap201905170621          2.15-snap201905170621
xcat-probe                  2.15-snap201905170621          2.15-snap201905170621
xcat-server                 2.15-snap201905170621          2.15-snap201905170621
xcat-test                   2.15-snap201905170621          2.15-snap201905170621
xcat-vlan                   (not installed)                2.15-snap201905170621
xcatsn                      (not installed)                2.15-snap201905170621

xCAT Dependency Packages
========================

Package Name                Installed                      In Repository
------------                ---------                      -------------
elilo-xcat                  3.14-4                         3.14-4
goconserver                 0.3.2-snap201811080419         0.3.2-snap201811080419
grub2-xcat                  2.02-0.76.el7.1.snap2019051602 2.02-0.76.el7.1.snap2019051602
ipmitool-xcat               1.8.18                         1.8.18
syslinux-xcat               3.86-2                         3.86-2
xcat-genesis-base-amd64     2.14.5-snap201811190037        2.14.5-snap201811190037
xcat-genesis-base-ppc64     2.14.5-snap201811160710        2.14.5-snap201811160710
xnba-undi                   1.0.3-7                        1.0.3-7
root@f6u13k13:~# lsdef f6u13k15 -z
# <xCAT data object stanza file>

f6u13k15:
    objtype=node
    addkcmdline=console=tty0 console=hvc0,115200
    arch=ppc64el
    cons=kvm
    consoleenabled=1
    currchain=boot
    currstate=netboot ubuntu18.04.2-ppc64el-compute
    groups=all
    ip=10.6.13.15
    mac=42:11:0a:06:0d:0f|42:02:0a:06:0d:0f!*NOIP*|42:a2:0a:06:0d:0f!*NOIP*
    mgt=kvm
    monserver=f6u13k13
    netboot=grub2
    nfsserver=f6u13k13
    os=ubuntu18.04.2
    profile=compute
    provmethod=ubuntu18.04.2-ppc64el-netboot-compute
    serialport=0
    serialspeed=115200
    status=powering-on
    statustime=05-21-2019 03:44:56
    tftpserver=f6u13k13
    updatestatus=failed
    updatestatustime=05-20-2019 14:18:04
    vmcpus=2
    vmhost=f6u13
    vmmemory=4096
    vmnicnicmodel=virtio-net-pci
    vmnics=br0,private_br0,private_br1
    vmstorage=phy:/dev/mapper/vdiskvg00-vdisk00n15
    xcatmaster=f6u13k13
neo954 commented 5 years ago

I tried to get the tty state with stty -a before and after I run rcons. But the two of outputs looked exactly same.

root@f6u13k13:~# stty -a >stty.out.good
root@f6u13k13:~# rcons f6u13k15
[Enter `^Ec?' for help]
goconserver(2019-05-21T04:31:18-04:00): Hello 10.6.13.13:33358, welcome to the session of f6u13k15
[Disconnected]
root@f6u13k13:~# stty -a >stty.out.1
root@f6u13k13:~# diff -u stty.out.good stty.out.1
root@f6u13k13:~# echo $?
0
root@f6u13k13:~# cat stty.out.good
speed 38400 baud; rows 62; columns 135; line = 0;
intr = ^C; quit = ^\; erase = ^?; kill = ^U; eof = ^D; eol = M-^?; eol2 = M-^?;
swtch = <undef>; start = ^Q; stop = ^S; susp = ^Z; rprnt = ^R; werase = ^W;
lnext = ^V; discard = ^O; min = 1; time = 0;
-parenb -parodd -cmspar cs8 -hupcl -cstopb cread -clocal -crtscts
-ignbrk -brkint -ignpar -parmrk -inpck -istrip -inlcr -igncr icrnl ixon -ixoff
-iuclc ixany imaxbel iutf8
opost -olcuc -ocrnl onlcr -onocr -onlret -ofill -ofdel nl0 cr0 tab0 bs0 vt0 ff0
isig icanon iexten echo echoe echok -echonl -noflsh -xcase -tostop -echoprt
echoctl echoke -flusho -extproc
root@f6u13k13:~# cat stty.out.1
speed 38400 baud; rows 62; columns 135; line = 0;
intr = ^C; quit = ^\; erase = ^?; kill = ^U; eof = ^D; eol = M-^?; eol2 = M-^?;
swtch = <undef>; start = ^Q; stop = ^S; susp = ^Z; rprnt = ^R; werase = ^W;
lnext = ^V; discard = ^O; min = 1; time = 0;
-parenb -parodd -cmspar cs8 -hupcl -cstopb cread -clocal -crtscts
-ignbrk -brkint -ignpar -parmrk -inpck -istrip -inlcr -igncr icrnl ixon -ixoff
-iuclc ixany imaxbel iutf8
opost -olcuc -ocrnl onlcr -onocr -onlret -ofill -ofdel nl0 cr0 tab0 bs0 vt0 ff0
isig icanon iexten echo echoe echok -echonl -noflsh -xcase -tostop -echoprt
echoctl echoke -flusho -extproc
neo954 commented 5 years ago

I tried to run strace tftp when the tty is in the broken state. It seems the read() system calls of tftp client process failed continuously and the errno was set to EAGAIN.

... <<< omit thousands of lines >>> ...
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
read(0, 0x397cdea1410, 1024)            = -1 EAGAIN (Resource temporarily unavailable)
write(1, "tftp> ", 6tftp> )                   = 6
... <<< omit thousands of lines >>> ...
neo954 commented 5 years ago

Enclosed please find the strace outputs. strace.tftp.out.good.txt strace.tftp.out.problem.txt

neo954 commented 5 years ago

This problem can be recreated with cat.

root@f6u13k13:~# rcons f6u13k14
[Enter `^Ec?' for help]
goconserver(2019-05-21T04:59:08-04:00): Hello 10.6.13.13:38404, welcome to the session of f6u13k14
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
root@f6u13k13:~# cat
cat: -: Resource temporarily unavailable
neo954 commented 5 years ago
root@f6u13k13:~# strace cat
execve("/bin/cat", ["cat"], 0x7fffc7e70f20 /* 28 vars */) = 0
brk(NULL)                               = 0xd9e3c350000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=28770, ...}) = 0
mmap(NULL, 28770, PROT_READ, MAP_PRIVATE, 3, 0) = 0x794af1900000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/powerpc64le-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0\25\0\1\0\0\0\20G\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2181704, ...}) = 0
mmap(NULL, 2250384, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x794af16d0000
mmap(0x794af18e0000, 131072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x200000) = 0x794af18e0000
close(3)                                = 0
mprotect(0x794af18e0000, 65536, PROT_READ) = 0
mprotect(0xd9e01270000, 65536, PROT_READ) = 0
mprotect(0x794af1970000, 65536, PROT_READ) = 0
munmap(0x794af1900000, 28770)           = 0
openat(AT_FDCWD, "/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=1991136, ...}) = 0
mmap(NULL, 1991136, PROT_READ, MAP_PRIVATE, 3, 0) = 0x794af14e0000
close(3)                                = 0
brk(NULL)                               = 0xd9e3c350000
brk(0xd9e3c380000)                      = 0xd9e3c380000
fstat(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 5), ...}) = 0
fstat(0, {st_mode=S_IFCHR|0600, st_rdev=makedev(136, 5), ...}) = 0
fadvise64(0, 0, 0, POSIX_FADV_SEQUENTIAL) = 0
mmap(NULL, 262144, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x794af14a0000
read(0, 0x794af14b0000, 131072)         = -1 EAGAIN (Resource temporarily unavailable)
write(2, "cat: ", 5cat: )                    = 5
write(2, "-", 1-)                        = 1
openat(AT_FDCWD, "/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=2995, ...}) = 0
read(3, "# Locale name alias data base.\n#"..., 4096) = 2995
read(3, "", 4096)                       = 0
close(3)                                = 0
openat(AT_FDCWD, "/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale-langpack/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale-langpack/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
write(2, ": Resource temporarily unavailab"..., 34: Resource temporarily unavailable) = 34
write(2, "\n", 1
)                       = 1
munmap(0x794af14a0000, 262144)          = 0
close(0)                                = 0
close(1)                                = 0
close(2)                                = 0
exit_group(1)                           = ?
+++ exited with 1 +++
neo954 commented 5 years ago

Okay, here is the problem.

root@f6u13k13:~# cat /proc/self/fdinfo/0
pos:    0
flags:  02
mnt_id: 26
root@f6u13k13:~# rcons f6u13k14
[Enter `^Ec?' for help]
goconserver(2019-05-21T05:21:47-04:00): Hello 10.6.13.13:41330, welcome to the session of f6u13k14
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
root@f6u13k13:~# cat /proc/self/fdinfo/0
pos:    0
flags:  024002
mnt_id: 26
neo954 commented 5 years ago

It seems the problem affected all three file descriptors 0, 1, and 2.

root@f6u13k13:~# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos:    0
flags:  02
mnt_id: 26

==> /proc/self/fdinfo/1 <==
pos:    0
flags:  02
mnt_id: 26

==> /proc/self/fdinfo/2 <==
pos:    0
flags:  02
mnt_id: 26
root@f6u13k13:~# rcons f6u13k14
[Enter `^Ec?' for help]
goconserver(2019-05-21T05:23:16-04:00): Hello 10.6.13.13:41500, welcome to the session of f6u13k14
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
root@f6u13k13:~# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos:    0
flags:  024002
mnt_id: 26

==> /proc/self/fdinfo/1 <==
pos:    0
flags:  024002
mnt_id: 26

==> /proc/self/fdinfo/2 <==
pos:    0
flags:  024002
mnt_id: 26
neo954 commented 5 years ago

See http://man7.org/linux/man-pages/man5/proc.5.html for details of the fdinfo subdirectory.

flags This is an octal number that displays the file access mode and file status flags (see open(2)). If the close-on-exec file descriptor flag is set, then flags will also include the value O_CLOEXEC. Before Linux 3.1, this field incorrectly displayed the setting of O_CLOEXEC at the time the file was opened, rather than the current setting of the close-on-exec flag.

neo954 commented 5 years ago

In file /usr/src/linux-headers-4.15.0-47/include/uapi/asm-generic/fcntl.h

#define O_RDWR          00000002
#ifndef O_NONBLOCK
#define O_NONBLOCK      00004000
#endif
#ifndef FASYNC
#define FASYNC          00020000        /* fcntl, for BSD compatibility */
#endif
neo954 commented 5 years ago
root@f6u13k13:~# uname -a
Linux f6u13k13 4.15.0-47-generic #50-Ubuntu SMP Wed Mar 13 10:40:40 UTC 2019 ppc64le ppc64le ppc64le GNU/Linux
neo954 commented 5 years ago

This is what happened on a RHEL 8 testing environment.

[root@c910f03c01p19 ~]# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos:    0
flags:  02
mnt_id: 25

==> /proc/self/fdinfo/1 <==
pos:    0
flags:  02
mnt_id: 25

==> /proc/self/fdinfo/2 <==
pos:    0
flags:  02
mnt_id: 25
[root@c910f03c01p19 ~]# rcons c910f03c01p10
[Enter `^Ec?' for help]
goconserver(2019-05-21T05:47:27-04:00): Hello 10.3.1.19:59050, welcome to the session of c910f03c01p10
done
[Disconnected]
[root@c910f03c01p19 ~]# head -n 99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos:    0
flags:  020002
mnt_id: 25

==> /proc/self/fdinfo/1 <==
pos:    0
flags:  020002
mnt_id: 25

==> /proc/self/fdinfo/2 <==
pos:    0
flags:  020002
mnt_id: 25
[root@c910f03c01p19 ~]# uname -a
Linux c910f03c01p19 4.18.0-80.el8.ppc64le #1 SMP Wed Mar 13 11:26:21 UTC 2019 ppc64le ppc64le ppc64le GNU/Linux
neo954 commented 5 years ago

In the latest goconserver source, v0.3.2.

$ grep -n -r NONBLOCK .
./console/client.go:132:    err := common.Fcntl(in, syscall.F_SETFL, syscall.O_ASYNC|syscall.O_NONBLOCK)
./console/client.go:173:    err := common.Fcntl(in, syscall.F_SETFL, syscall.O_ASYNC|syscall.O_NONBLOCK)
./console/cli.go:320:   err = common.Fcntl(in, syscall.F_SETFL, syscall.O_ASYNC|syscall.O_NONBLOCK)
neo954 commented 5 years ago

@chenglch, Do you have any idea about this issue? :-/

neo954 commented 5 years ago

It seems we should blame bash on Ubuntu. If run zsh on Ubuntu, the behavior will be the same as RHEL 8. But the O_ASYNC flag is still there.

root@c910f03c01p06:~# zsh
c910f03c01p06# head -99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos:    0
flags:  02
mnt_id: 25

==> /proc/self/fdinfo/1 <==
pos:    0
flags:  02
mnt_id: 25

==> /proc/self/fdinfo/2 <==
pos:    0
flags:  02
mnt_id: 25
c910f03c01p06# rcons c910f03c01p07
[Enter `^Ec?' for help]
goconserver(2019-05-22T01:33:48-04:00): Hello 10.3.1.6:44324, welcome to the session of c910f03c01p07
c910f03c01p07: Logon failed
Could not receive message, error: EOF.
[Enter `^Ec.' to exit]
Session is teminated unexpectedly, retrying....
[Disconnected]
c910f03c01p06# head -99 /proc/self/fdinfo/{0,1,2}
==> /proc/self/fdinfo/0 <==
pos:    0
flags:  020002
mnt_id: 25

==> /proc/self/fdinfo/1 <==
pos:    0
flags:  020002
mnt_id: 25

==> /proc/self/fdinfo/2 <==
pos:    0
flags:  020002
mnt_id: 25
neo954 commented 5 years ago

Here is a simple test program.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{

   int flags = fcntl(0, F_GETFL, 0);
   printf("stdin was formerly %sblocking\n", (flags & O_NONBLOCK) ? "non" : "");

   int nb = argc < 2 ? 0 : atoi(argv[1]);
   ioctl(0, FIONBIO, &nb);

   flags = fcntl(0, F_GETFL, 0);
   printf("stdin is now %sblocking\n", (flags & O_NONBLOCK) ? "non" : "");
}

On Ubuntu 18.04.2

root@c910f03c01p06:~# cat /proc/self/fdinfo/1
pos:    0
flags:  020002
mnt_id: 25
root@c910f03c01p06:~# ./a.out 1
stdin was formerly blocking
stdin is now nonblocking
root@c910f03c01p06:~# cat /proc/self/fdinfo/1
pos:    0
flags:  024002
mnt_id: 25

On RHEL 8

[root@c910f03c01p19 ~]# cat /proc/self/fdinfo/1
pos:    0
flags:  02
mnt_id: 25
[root@c910f03c01p19 ~]# ./a.out 1
stdin was formerly blocking
stdin is now nonblocking
[root@c910f03c01p19 ~]# cat /proc/self/fdinfo/1
pos:    0
flags:  02
mnt_id: 25