Closed janv37 closed 3 years ago
Do you see the actual core dump in /cores? If so, could you run pstack
on the core file?
This is in global zone
Managed to enable per process coredumps:
[root@kairoshv /var/tmp]# coreadm
global core file pattern: /var/tmp/%d/%f.%p.%n
global core file content: all
init core file pattern: /%Z/cores/core.%f.%p
init core file content: default
global core dumps: enabled
per-process core dumps: enabled
global setid core dumps: enabled
per-process setid core dumps: disabled
global core dump logging: disabled
[root@kairoshv /var/tmp]# truss -f atq
23387: execve("/usr/bin/atq", 0x08047C3C, 0x08047C44) argc = 1
23387: sysinfo(SI_MACHINE, "i86pc", 257) = 6
23387: mmap(0x00000000, 32, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON, -1, 0) = 0xFEFA0000
23387: mmap(0x00000000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0) = 0xFEF90000
23387: mmap(0x00000000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0) = 0xFEF80000
23387: mmap(0x00000000, 4096, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON, -1, 0) = 0xFEF70000
23387: memcntl(0xFEFB2000, 50084, MC_ADVISE, MADV_WILLNEED, 0, 0) = 0
23387: memcntl(0x08050000, 7032, MC_ADVISE, MADV_WILLNEED, 0, 0) = 0
23387: resolvepath("/usr/lib/ld.so.1", "/lib/ld.so.1", 1023) = 12
23387: resolvepath("/usr/bin/atq", "/usr/bin/atq", 1023) = 12
23387: sysconfig(_CONFIG_PAGESIZE) = 4096
23387: stat64("/usr/bin/atq", 0x080478D0) = 0
23387: open("/var/ld/ld.config", O_RDONLY) Err#2 ENOENT
23387: stat64("/usr/gcc/7/lib/libc.so.1", 0x080470F0) Err#2 ENOENT
23387: stat64("/lib/libc.so.1", 0x080470F0) = 0
23387: resolvepath("/lib/libc.so.1", "/lib/libc.so.1", 1023) = 14
23387: open("/lib/libc.so.1", O_RDONLY) = 3
23387: mmapobj(3, MMOBJ_INTERPRET, 0xFEF70BC8, 0x0804715C, 0x00000000) = 0
23387: close(3) = 0
23387: mmap(0x00000000, 4096, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON, -1, 0) = 0xFEDF0000
23387: memcntl(0xFEE00000, 267976, MC_ADVISE, MADV_WILLNEED, 0, 0) = 0
23387: mmap(0x00010000, 24576, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON|MAP_ALIGN, -1, 0) = 0xFEDE0000
23387: getcontext(0x08047710)
23387: getrlimit(RLIMIT_STACK, 0x08047708) = 0
23387: getpid() = 23387 [23386]
23387: lwp_private(0, 1, 0xFEDE2A40) = 0x000001C3
23387: getrandom("A9\t0FC7", 4, 0) = 4
23387: setustack(0xFEDE2AA0)
23387: lwp_cond_broadcast(0xFEDF00FC) = 0
23387: lwp_cond_broadcast(0xFEF7062C) = 0
23387: sysi86(SI86FPSTART, 0xFEF67E0C, 0x0000133F, 0x00001F80) = 0x00000001
23387: sysconfig(_CONFIG_PAGESIZE) = 4096
23387: open("/usr/lib/locale//cs_CZ.UTF-8/LC_CTYPE/LCL_DATA", O_RDONLY) = 3
23387: fstat(3, 0x08047548) = 0
23387: mmap(0x00000000, 48860, PROT_READ, MAP_PRIVATE, 3, 0) = 0xFEDD0000
23387: close(3) = 0
23387: mmap(0x00010000, 65536, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON|MAP_ALIGN, -1, 0) = 0xFEDB0000
23387: munmap(0xFEDD0000, 48860) = 0
23387: getuid() = 0 [0]
23387: brk(0x00000000) = 134629244
23387: brk(0x08064780) = 0
23387: brk(0x08066780) = 0
23387: getuid() = 0 [0]
23387: getpid() = 23387 [23386]
23387: open("/proc/23387/psinfo", O_RDONLY) = 3
23387: read(3, "\0\0\00201\0\0\0 [ [\0\0".., 336) = 336
23387: close(3) = 0
23387: mmap(0x00010000, 65536, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON|MAP_ALIGN, -1, 0) = 0xFED90000
23387: getuid() = 0 [0]
23387: getuid() = 0 [0]
23387: open64("/var/run/name_service_door", O_RDONLY) = 3
23387: fcntl(3, F_SETFD, 0x00000001) = 0
23387: door_info(3, 0xFEF5B69C) = 0
23387: door_call(3, 0x08047938) = 0
23387: stat64("/etc/cron.d/at.allow", 0x080646E0) Err#2 ENOENT
23387: stat64("/etc/cron.d/at.deny", 0x080646E0) = 0
23387: open64("/etc/cron.d/at.deny", O_RDONLY) = 4
23387: fstat64(4, 0x08047A00) = 0
23387: fstat64(4, 0x08047900) = 0
23387: ioctl(4, TCGETA, 0x080479BE) Err#25 ENOTTY
23387: read(4, " d a e m o n\n b i n\n n".., 4096) = 17
23387: read(4, 0x08064BC4, 4096) = 0
23387: llseek(4, 0, SEEK_CUR) = 17
23387: close(4) = 0
23387: stat64("/usr/gcc/7/lib/libsecdb.so.1", 0x08047168) Err#2 ENOENT
23387: stat64("/lib/libsecdb.so.1", 0x08047168) = 0
23387: resolvepath("/lib/libsecdb.so.1", "/lib/libsecdb.so.1", 1023) = 18
23387: open("/lib/libsecdb.so.1", O_RDONLY) = 4
23387: mmapobj(4, MMOBJ_INTERPRET, 0xFEDF0CE8, 0x080471D4, 0x00000000) = 0
23387: close(4) = 0
23387: mmap(0x00000000, 4096, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON, -1, 0) = 0xFED80000
23387: memcntl(0xFE8B0000, 7404, MC_ADVISE, MADV_WILLNEED, 0, 0) = 0
23387: lwp_cond_broadcast(0xFED800FC) = 0
23387: stat64("/usr/gcc/7/lib/libnsl.so.1", 0x08041038) Err#2 ENOENT
23387: stat64("/lib/libnsl.so.1", 0x08041038) = 0
23387: resolvepath("/lib/libnsl.so.1", "/lib/libnsl.so.1", 1023) = 16
23387: open("/lib/libnsl.so.1", O_RDONLY) = 4
23387: mmapobj(4, MMOBJ_INTERPRET, 0xFED80698, 0x080410A4, 0x00000000) = 0
23387: close(4) = 0
23387: memcntl(0xFE9B0000, 78612, MC_ADVISE, MADV_WILLNEED, 0, 0) = 0
23387: sigfillset(0xFEF672C8) = 0
23387: lwp_cond_broadcast(0xFED8087C) = 0
23387: getuid() = 0 [0]
23387: getuid() = 0 [0]
23387: door_info(3, 0x08041780) = 0
23387: door_call(3, 0x080417E8) = 0
23387: chdir("/var/spool/cron/atjobs") = 0
23387: open(".", O_RDONLY|O_NDELAY|O_LARGEFILE) = 4
23387: fcntl(4, F_SETFD, 0x00000001) = 0
23387: fstat64(4, 0x080479D0) = 0
23387: fstat64(4, 0x08047AE0) = 0
23387: getdents64(4, 0xFEDE4000, 8192) = 120
23387: stat64("/usr/gcc/7/lib/libbsm.so.1", 0x08047078) Err#2 ENOENT
23387: stat64("/lib/libbsm.so.1", 0x08047078) = 0
23387: resolvepath("/lib/libbsm.so.1", "/lib/libbsm.so.1", 1023) = 16
23387: open("/lib/libbsm.so.1", O_RDONLY) = 5
23387: mmapobj(5, MMOBJ_INTERPRET, 0xFED80E18, 0x080470E4, 0x00000000) = 0
23387: close(5) = 0
23387: mmap(0x00000000, 4096, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON, -1, 0) = 0xFED70000
23387: memcntl(0xFE910000, 45828, MC_ADVISE, MADV_WILLNEED, 0, 0) = 0
23387: lwp_cond_broadcast(0xFED700FC) = 0
23387: brk(0x08066780) = 0
23387: brk(0x08068780) = 0
23387: getdents64(4, 0xFEDE4000, 8192) = 0
23387: close(4) = 0
23387: ioctl(1, TCGETA, 0x08046BFE) = 0
23387: fstat64(1, 0x08046B40) = 0
Rank Execution Date Owner Job Queue Job Name
23387: write(1, " R a n k\t E x e c".., 70) = 70
23387: stat64("1616202000.a", 0x08047ADC) = 0
23387: open("/var/run/tzsync", O_RDONLY) = 4
23387: mmap(0x00000000, 4, PROT_READ, MAP_SHARED, 4, 0) = 0xFED60000
23387: close(4) = 0
23387: brk(0x08068780) = 0
23387: brk(0x0806A780) = 0
23387: open("/usr/share/lib/zoneinfo/UTC", O_RDONLY) = 4
23387: fstat64(4, 0x08047900) = 0
23387: read(4, " T Z i f\0\0\0\0\0\0\0\0".., 56) = 56
23387: close(4) = 0
23387: upanic(0xFEEF88A4, 28)
[root@kairoshv /var/tmp]# pflags core.atq.23387
core 'core.atq.23387' of 23387: atq
data model = _ILP32 flags = FORK|RLC|ASYNC|MSACCT|MSFORK
flttrace = 0xfffffbff
sigtrace = 0xfffffeff 0xffffffff 0xffffffff
HUP|INT|QUIT|ILL|TRAP|ABRT|EMT|FPE|BUS|SEGV|SYS|PIPE|ALRM|TERM|USR1|USR2|CLD|PWR|WINCH|URG|POLL|STOP|TSTP|CONT|TTIN|TTOU|VTALRM|PROF|XCPU|XFSZ|WAITING|LWP|FREEZE|THAW|CANCEL|LOST|XRES|JVM1|JVM2|INFO|RTMIN|RTMIN+1|RTMIN+2|RTMIN+3|RTMIN+4|RTMIN+5|RTMIN+6|RTMIN+7|RTMIN+8|RTMIN+9|RTMIN+10|RTMIN+11|RTMIN+12|RTMIN+13|RTMIN+14|RTMIN+15|RTMIN+16|RTMAX-15|RTMAX-14|RTMAX-13|RTMAX-12|RTMAX-11|RTMAX-10|RTMAX-9|RTMAX-8|RTMAX-7|RTMAX-6|RTMAX-5|RTMAX-4|RTMAX-3|RTMAX-2|RTMAX-1|RTMAX
entryset = 0x00000001 0x04000000 0x00000000 0x10000008
0x80000000 0x00000000 0x00000000 0x00000000
exitset = 0xfffffffe 0xffffffff 0xffffffff 0xeffffff7
0x7fffffff 0xffffffff 0xffffffff 0xffffffff
/1: flags = 0
sigmask = 0xffffbefc,0xffffffff,0x000003ff
cursig = SIGABRT
[root@kairoshv /var/tmp]# pstack core.atq.23387
core 'core.atq.23387' of 23387: atq
feeee6c5 syscall (feef88a4, 1c, a, a9, 8047aa8, 8047a8a) + 5
fee7da2f ???????? (8064310, 8047c40, 0, 8065f0c, 0, 60554910)
08052244 printjobname (8065f02)
080526fa printqueue (0, 0) + 9a
08052bcb main (feeee707, fef68648) + 394
08051f97 _start_crt (1, 8047c3c, fefd0117, 0, 0, 0) + 96
08051e6a _start (1, 8047d4c, 0, 8047d50, 8047da3, 8047db7) + 1a
Didn't know that cores were written into /cores by default; Yes, I have the core files there too:
[root@kairoshv /cores]# pstack core.atq* core 'core.atq.22722' of 22722: atq feeee6c5 syscall (feef88a4, 1c, a, 4b, 8047ab8, 8047a9a) + 5 fee7da2f ???????? (8064310, 8047c5c, 0, 8065f0c, 0, 60554910) 08052244 printjobname (8065f02) 080526fa printqueue (0, 0) + 9a 08052bcb main (feeee707, fef68648) + 394 08051f97 _start_crt (1, 8047c58, fefd0117, 0, 0, 0) + 96 08051e6a _start (1, 8047d64, 0, 8047d68, 8047dbb, 8047dcf) + 1a
core 'core.atq.23081' of 23081: atq feeee6c5 syscall (feef88a4, 1c, a, fc, 8047ab8, 8047a9a) + 5 fee7da2f ???????? (8064310, 8047c5c, 0, 8065f0c, 0, 60554910) 08052244 printjobname (8065f02) 080526fa printqueue (0, 0) + 9a 08052bcb main (feeee707, fef68648) + 394 08051f97 _start_crt (1, 8047c58, fefd0117, 0, 0, 0) + 96 08051e6a _start (1, 8047d64, 0, 8047d68, 8047dbb, 8047dcf) + 1a
core 'core.atq.23093' of 23093: atq feeee6c5 syscall (feef88a4, 1c, a, 81, 8047aa8, 8047a8a) + 5 fee7da2f ???????? (8064310, 8047c44, 0, 8065f0c, 0, 60554910) 08052244 printjobname (8065f02) 080526fa printqueue (0, 0) + 9a 08052bcb main (feeee707, fef68648) + 394 08051f97 _start_crt (1, 8047c40, fefd0117, 0, 0, 0) + 96 08051e6a _start (1, 8047d50, 0, 8047d54, 8047da7, 8047dbb) + 1a
core 'core.atq.23096' of 23096: atq feeee6c5 syscall (feef88a4, 1c, a, a7, 8047aa8, 8047a8a) + 5 fee7da2f ???????? (8064310, 8047c44, 0, 8065f0c, 0, 60554910) 08052244 printjobname (8065f02) 080526fa printqueue (0, 0) + 9a 08052bcb main (feeee707, fef68648) + 394 08051f97 _start_crt (1, 8047c40, fefd0117, 0, 0, 0) + 96 08051e6a _start (1, 8047d50, 0, 8047d54, 8047da7, 8047dbb) + 1a
core 'core.atq.23110' of 23110: atq feeee6c5 syscall (feef88a4, 1c, a, 2e, 8047aa8, 8047a8a) + 5 fee7da2f ???????? (8064310, 8047c44, 0, 8065f0c, 0, 60554910) 08052244 printjobname (8065f02) 080526fa printqueue (0, 0) + 9a 08052bcb main (feeee707, fef68648) + 394 08051f97 _start_crt (1, 8047c40, fefd0117, 0, 0, 0) + 96 08051e6a _start (1, 8047d50, 0, 8047d54, 8047da7, 8047dbb) + 1a
core 'core.atq.23133' of 23133: atq feeee6c5 syscall (feef88a4, 1c, a, 8e, 8047aa8, 8047a8a) + 5 fee7da2f ???????? (8064310, 8047c40, 0, 8065f0c, 0, 60554910) 08052244 printjobname (8065f02) 080526fa printqueue (0, 0) + 9a 08052bcb main (feeee707, fef68648) + 394 08051f97 _start_crt (1, 8047c3c, fefd0117, 0, 0, 0) + 96 08051e6a _start (1, 8047d4c, 0, 8047d50, 8047da3, 8047db7) + 1a
Can you try one more thing? mdb -e '::status' /usr/bin/atq core.atq.23387
(if the core file isn't in the current directory, adjust the parameter accordingly).
Or really on any of those core files will probably work.
[root@kairoshv /var/tmp]# mdb -e '::status' /usr/bin/atq core.atq.23387
debugging core file of atq (32-bit) from kairoshv
file: /usr/bin/atq
initial argv: atq
threading model: native threads
status: process panicked
upanic message: *** stack smashing detected
Using that same core file (core.atq.23387 -- the value may be different in the other core files) can you try mdb -e '8065f02/s' /usr/bin/atq core.atq.23387
?
[root@kairoshv /var/tmp]# mdb -e '8065f02/s' /usr/bin/atq core.atq.23387
0x8065f02: 1616202000.a
Ok, can you cat the contents of that file and paste them (assuming nothing sensitive -- please verify first of course). My suspicion is that a line starting with a : jobname:
has an invalid value.
It should be the third line of the file (and really just pasting the first three lines of the file should be enough)
[root@kairoshv /var/tmp]# cat /var/spool/cron/atjobs/1616202000.a
: at job
: jobname: /tmp/scrub
: notify by mail: no
: project: 1
export MANPATH; MANPATH='/usr/share/man:/smartdc/man:/opt/smartdc/man:/opt/local/man:/opt/tools/man'
export TERM; TERM='xterm-256color'
export SHELL; SHELL='/usr/bin/bash'
export CURL_CA_BUNDLE; CURL_CA_BUNDLE='/opt/tools/share/mozilla-rootcerts/cacert.pem'
export SSH_CLIENT; SSH_CLIENT='<REDACTED IP xx.xx.xx.xx> 51204 22'
export SSH_TTY; SSH_TTY='/dev/pts/4'
export USER; USER='root'
export PAGER; PAGER='less'
export MAIL; MAIL='/var/mail/root'
export PATH; PATH='/usr/bin:/usr/sbin:/smartdc/bin:/opt/smartdc/bin:/opt/local/bin:/opt/local/sbin:/opt/tools/bin:/opt/tools/sbin:/opt/smartdc/agents/bin'
export PWD; PWD='/root'
export LANG; LANG='C'
export TZ; TZ='UTC'
export SHLVL; SHLVL='1'
export HOME; HOME='/root'
export LOGNAME; LOGNAME='root'
export SSH_CONNECTION; SSH_CONNECTION='<REDACTED IP xx.xx.xx.xx> 51204 <REDACTED IP xx.xx.xx.xx> 22'
export LC_CTYPE; LC_CTYPE='cs_CZ.UTF-8'
export PROMPT_COMMAND; PROMPT_COMMAND='echo -ne "\033]0;${HOSTNAME} \007" && history -a'
export _; _='/usr/bin/at'
$SHELL << '...the rest of this file is shell input'
#
# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.2 */
cd '/root'
umask 22
#!/bin/sh
zpool scrub zones
[root@kairoshv /var/tmp]#
Only replaced IP addressess with <REDACTED IP xx.xx.xx.xx>
I've found the issue (the stack trace is slightly misleading). I can supply you a fixed binary, a PI with the fix, or if you're content to wait for the next release (next week), I should be able to get the fix into it.
Jason, many thanks for looking into this and fixing it! I can wait for regular release cycle if the fix would go in.
In the meantime, I've checked how do OS zones behave and I can see the same problem. Would these need to be updated too?
[root@kairoshv ~]# vmadm get 3c44a37e-c081-c099-b4f6-8cba638a9094 | json image_uuid
d504c2d2-830d-11ea-9fb5-337396d92f38
[root@kairoshv ~]# imgadm avail | fgrep $(vmadm get 3c44a37e-c081-c099-b4f6-8cba638a9094 | json image_uuid)
d504c2d2-830d-11ea-9fb5-337396d92f38 base-64-trunk 20200420 smartos zone-dataset 2020-04-20
pkgin updated to latest.
/usr are a part of the PI image (and get shared by all the zones), aren't a part of a particular instance image, so updating the platform should resolve it for all the zones.
Understood. Thanks for the explanation and again for the fix. Have a nice weekend.
If you're curious the bug (filed upstream since this impacts every illumos-based distribution) is: https://www.illumos.org/issues/13657, and the fix (which should make it's way into illumos-joyent on Monday as it's been integrated upstream) is: https://github.com/illumos/illumos-gate/commit/1d5eda342f972d3bf9b3bec5dc9d1103beb74cb7
Hello, I'd like to report coredump in atq, when there's job scheduled. It is reproducible and happens after scheduling a job. The job runs fine.
Steps to reproduce:
Expected behavior:
To see actual list of scheduled jobs with ids, to be able to remove them (for example).
Running:
SmartOS 20210311T001742Z (just updated) Have been able to reproduce same behavior on SmartOS 20210114T163038Z too.