oetiker / znapzend

zfs backup with remote capabilities and mbuffer integration.
www.znapzend.org
GNU General Public License v3.0
612 stars 137 forks source link

Znapzend hangs on 2.2.5 in Arch Linux #672

Open sempervictus opened 2 months ago

sempervictus commented 2 months ago

Trying to runonce a freshly configured backup set on a new system with 2.2.5 results in znapzend hanging forever without creating or sending anything.

Ran an strace of the execution and looks like it errors out then just clock_nanosleeps forever:

clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x6f320fc20310) = 2575019
close(9)                                = 0
close(7)                                = 0
read(8, "", 4)                          = 0
close(8)                                = 0
fcntl(6, F_SETFD, FD_CLOEXEC)           = 0
ioctl(6, TCGETS, 0x77196e686f90)        = -1 ENOTTY (Inappropriate ioctl for device)
lseek(6, 0, SEEK_CUR)                   = -1 ESPIPE (Illegal seek)
read(5, "", 8192)                       = 0
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=2575018, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
read(6, "", 8192)                       = 0
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=2575019, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
pipe2([7, 8], O_CLOEXEC)                = 0
pipe2([9, 10], O_CLOEXEC)               = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x6f320fc20310) = 2575020
close(10)                               = 0
close(8)                                = 0
read(9, "", 4)                          = 0
close(9)                                = 0
fcntl(7, F_SETFD, FD_CLOEXEC)           = 0
ioctl(7, TCGETS, 0x77196e686f90)        = -1 ENOTTY (Inappropriate ioctl for device)
lseek(7, 0, SEEK_CUR)                   = -1 ESPIPE (Illegal seek)
read(7, "on\n", 8192)                   = 3
close(7)                                = 0
wait4(2575020, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 2575020
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=2575020, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
close(6)                                = 0
wait4(2575019, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 2575019
close(5)                                = 0
wait4(2575018, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 2575018
getpid()                                = 2574975
sendto(4, "<30>Aug 24 16:56:19 znapzend[257"..., 82, MSG_NOSIGNAL, NULL, 0) = 82
getpid()                                = 2574975
sendto(4, "<30>Aug 24 16:56:19 znapzend[257"..., 78, MSG_NOSIGNAL, NULL, 0) = 78
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
openat(AT_FDCWD, "/dev/urandom", O_RDONLY|O_CLOEXEC) = 5
read(5, "O,\1\376", 4)                  = 4
close(5)                                = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
newfstatat(AT_FDCWD, "/etc/localtime", {st_mode=S_IFREG|0644, st_size=114, ...}, 0) = 0
getpid()                                = 2574975
getpid()                                = 2574975
sendto(4, "<30>Aug 24 16:56:19 znapzend[257"..., 105, MSG_NOSIGNAL, NULL, 0) = 105
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, 0x77196e687290) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=0, tv_nsec=0}, NULL) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, 0x77196e687290) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=0, tv_nsec=0}, NULL) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, 0x77196e687290) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=0, tv_nsec=0}, NULL) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, 0x77196e687290) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=0, tv_nsec=0}, NULL) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, 0x77196e687290) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=0, tv_nsec=0}, NULL) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, 0x77196e687290) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=0, tv_nsec=0}, NULL) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, 0x77196e687290) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=0, tv_nsec=0}, NULL) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, ^C{tv_sec=0, tv_nsec=8052287}) = ? ERESTART_RESTARTBLOCK (Interrupted by signal)
strace: Process 2574975 detached

System is 6.6.46 with ZFS & SPL built into the kernel binary (vs being a module).

jimklimov commented 2 weeks ago

Just in case, are you sure strace attached to the znapzend (perl) process? Maybe the nanosleeps are about it waiting for I/O from a launched zfs program, or mbuffer etc.?

With --debug it should show what it runs, so you can copy-paste the command into shell and run (or here strace) it.

It may help to run iostat -z 1 and/or zpool iostat 1 along the way - perhaps the first runonce tries to send gigabytes of your backed-up dataset to a remote destination, so while zfs send runs, there is nothing for the script to report?