hanwen / go-fuse

FUSE bindings for Go
Other
1.97k stars 313 forks source link

deadlock on CI #489

Closed hanwen closed 7 months ago

hanwen commented 9 months ago

GOMAXPROCS=1 ./all.bash hangs:

PC=0x46d201 m=0 sigcode=0

goroutine 0 [idle]:
runtime.futex()
    /usr/lib/golang/src/runtime/sys_linux_amd64.s:554 +0x21 fp=0x7fff10699a68 sp=0x7fff10699a60 pc=0x46d201
runtime.futexsleep(0x460d7d?, 0x739f08?, 0x76ce80?)
    /usr/lib/golang/src/runtime/os_linux.go:69 +0x36 fp=0x7fff10699ab8 sp=0x7fff10699a68 pc=0x433416
runtime.notesleep(0x7857e8)
    /usr/lib/golang/src/runtime/lock_futex.go:160 +0x87 fp=0x7fff10699af0 sp=0x7fff10699ab8 pc=0x40cd67
runtime.mPark(...)
    /usr/lib/golang/src/runtime/proc.go:1531
runtime.stopm()
    /usr/lib/golang/src/runtime/proc.go:2335 +0x8c fp=0x7fff10699b20 sp=0x7fff10699af0 pc=0x43deec
runtime.exitsyscall0(0xc00023d860)
    /usr/lib/golang/src/runtime/proc.go:4140 +0x15a fp=0x7fff10699b50 sp=0x7fff10699b20 pc=0x4423da
runtime.mcall()
    /usr/lib/golang/src/runtime/asm_amd64.s:452 +0x43 fp=0x7fff10699b60 sp=0x7fff10699b50 pc=0x4690c3

goroutine 1 [chan receive]:
runtime.gopark(0xc0003d1400?, 0xc000014290?, 0x40?, 0xe3?, 0xc0000a9a28?)
    /usr/lib/golang/src/runtime/proc.go:381 +0xd6 fp=0xc0000a99a8 sp=0xc0000a9988 pc=0x43a016
runtime.chanrecv(0xc00006c380, 0xc0000a9aa7, 0x1)
    /usr/lib/golang/src/runtime/chan.go:583 +0x49d fp=0xc0000a9a38 sp=0xc0000a99a8 pc=0x407bbd
runtime.chanrecv1(0x784c40?, 0x5c1a40?)
    /usr/lib/golang/src/runtime/chan.go:442 +0x18 fp=0xc0000a9a60 sp=0xc0000a9a38 pc=0x4076f8
testing.(*T).Run(0xc000007d40, {0x602bb9?, 0x4e2965?}, 0x60e2c8)
    /usr/lib/golang/src/testing/testing.go:1630 +0x405 fp=0xc0000a9b20 sp=0xc0000a9a60 pc=0x4e3b05
testing.runTests.func1(0x7856a0?)
    /usr/lib/golang/src/testing/testing.go:2036 +0x45 fp=0xc0000a9b70 sp=0xc0000a9b20 pc=0x4e5c65
testing.tRunner(0xc000007d40, 0xc0000a9c88)
    /usr/lib/golang/src/testing/testing.go:1576 +0x10b fp=0xc0000a9bc0 sp=0xc0000a9b70 pc=0x4e2c4b
testing.runTests(0xc0000900a0?, {0x7814a0, 0x3c, 0x3c}, {0xc000089440?, 0x100c0000a9d10?, 0x784e00?})
    /usr/lib/golang/src/testing/testing.go:2034 +0x489 fp=0xc0000a9cb8 sp=0xc0000a9bc0 pc=0x4e5b49
testing.(*M).Run(0xc0000900a0)
    /usr/lib/golang/src/testing/testing.go:1906 +0x63a fp=0xc0000a9f00 sp=0xc0000a9cb8 pc=0x4e44ba
main.main()
    _testmain.go:165 +0x1aa fp=0xc0000a9f80 sp=0xc0000a9f00 pc=0x5adc6a
runtime.main()
    /usr/lib/golang/src/runtime/proc.go:250 +0x207 fp=0xc0000a9fe0 sp=0xc0000a9f80 pc=0x439be7
runtime.goexit()
    /usr/lib/golang/src/runtime/asm_amd64.s:1598 +0x1 fp=0xc0000a9fe8 sp=0xc0000a9fe0 pc=0x46b301

goroutine 2 [force gc (idle)]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
    /usr/lib/golang/src/runtime/proc.go:381 +0xd6 fp=0xc00004afb0 sp=0xc00004af90 pc=0x43a016
runtime.goparkunlock(...)
    /usr/lib/golang/src/runtime/proc.go:387
runtime.forcegchelper()
    /usr/lib/golang/src/runtime/proc.go:305 +0xb0 fp=0xc00004afe0 sp=0xc00004afb0 pc=0x439e50
runtime.goexit()
    /usr/lib/golang/src/runtime/asm_amd64.s:1598 +0x1 fp=0xc00004afe8 sp=0xc00004afe0 pc=0x46b301
created by runtime.init.6
    /usr/lib/golang/src/runtime/proc.go:293 +0x25

goroutine 3 [GC sweep wait]:
runtime.gopark(0x1?, 0x0?, 0x0?, 0x0?, 0x0?)
    /usr/lib/golang/src/runtime/proc.go:381 +0xd6 fp=0xc00004b780 sp=0xc00004b760 pc=0x43a016
runtime.goparkunlock(...)
    /usr/lib/golang/src/runtime/proc.go:387
runtime.bgsweep(0x0?)
    /usr/lib/golang/src/runtime/mgcsweep.go:319 +0xde fp=0xc00004b7c8 sp=0xc00004b780 pc=0x424bde
runtime.gcenable.func1()
    /usr/lib/golang/src/runtime/mgc.go:178 +0x26 fp=0xc00004b7e0 sp=0xc00004b7c8 pc=0x419e46
runtime.goexit()
    /usr/lib/golang/src/runtime/asm_amd64.s:1598 +0x1 fp=0xc00004b7e8 sp=0xc00004b7e0 pc=0x46b301
created by runtime.gcenable
    /usr/lib/golang/src/runtime/mgc.go:178 +0x6b

goroutine 4 [sleep]:
runtime.gopark(0xc000078000?, 0x4185d8c5c0d0?, 0x0?, 0x0?, 0x60e5b0?)
    /usr/lib/golang/src/runtime/proc.go:381 +0xd6 fp=0xc00004bf30 sp=0xc00004bf10 pc=0x43a016
runtime.goparkunlock(...)
    /usr/lib/golang/src/runtime/proc.go:387
runtime.(*scavengerState).sleep(0x784e80, 0x4114236000000000)
    /usr/lib/golang/src/runtime/mgcscavenge.go:479 +0x125 fp=0xc00004bfa0 sp=0xc00004bf30 pc=0x422cc5
runtime.bgscavenge(0x0?)
    /usr/lib/golang/src/runtime/mgcscavenge.go:637 +0x7f fp=0xc00004bfc8 sp=0xc00004bfa0 pc=0x4230bf
runtime.gcenable.func2()
    /usr/lib/golang/src/runtime/mgc.go:179 +0x26 fp=0xc00004bfe0 sp=0xc00004bfc8 pc=0x419de6
runtime.goexit()
    /usr/lib/golang/src/runtime/asm_amd64.s:1598 +0x1 fp=0xc00004bfe8 sp=0xc00004bfe0 pc=0x46b301
created by runtime.gcenable
    /usr/lib/golang/src/runtime/mgc.go:179 +0xaa

goroutine 5 [finalizer wait]:
runtime.gopark(0x0?, 0x60e550?, 0xa0?, 0xe1?, 0x2000000020?)
    /usr/lib/golang/src/runtime/proc.go:381 +0xd6 fp=0xc00004a628 sp=0xc00004a608 pc=0x43a016
runtime.runfinq()
    /usr/lib/golang/src/runtime/mfinal.go:193 +0x107 fp=0xc00004a7e0 sp=0xc00004a628 pc=0x418e87
runtime.goexit()
    /usr/lib/golang/src/runtime/asm_amd64.s:1598 +0x1 fp=0xc00004a7e8 sp=0xc00004a7e0 pc=0x46b301
created by runtime.createfing
    /usr/lib/golang/src/runtime/mfinal.go:163 +0x45

goroutine 385 [running]:
    goroutine running on other thread; stack unavailable
created by testing.(*T).Run
    /usr/lib/golang/src/testing/testing.go:1629 +0x3ea

goroutine 386 [runnable]:
syscall.Syscall(0x471dee?, 0xc0000c6cf8?, 0xc000059d78?, 0x5bfa60?)
    /usr/lib/golang/src/syscall/syscall_linux.go:69 +0x27 fp=0xc000059d98 sp=0xc000059d28 pc=0x47baa7
syscall.read(0xc0000c6cf8?, {0xc00057e1b0?, 0xc000639308?, 0xc000059ea0?})
    /usr/lib/golang/src/syscall/zsyscall_linux_amd64.go:711 +0x45 fp=0xc000059dd8 sp=0xc000059d98 pc=0x479cc5
syscall.Read(...)
    /usr/lib/golang/src/syscall/syscall_unix.go:178
github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest.func1(...)
    /home/hanwen/vc/go-fuse/fuse/server.go:334
github.com/hanwen/go-fuse/v2/fuse.handleEINTR(...)
    /home/hanwen/vc/go-fuse/fuse/server.go:309
github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest(0xc0000c6c60, 0x0?)
    /home/hanwen/vc/go-fuse/fuse/server.go:332 +0x1c5 fp=0xc000059eb0 sp=0xc000059dd8 pc=0x53a6c5
github.com/hanwen/go-fuse/v2/fuse.(*Server).loop(0xc0000c6c60, 0x90?)
    /home/hanwen/vc/go-fuse/fuse/server.go:477 +0x72 fp=0xc000059f28 sp=0xc000059eb0 pc=0x53b472
github.com/hanwen/go-fuse/v2/fuse.(*Server).Serve(0xc0000c6c60)
    /home/hanwen/vc/go-fuse/fuse/server.go:421 +0x35 fp=0xc000059fc8 sp=0xc000059f28 pc=0x53b175
github.com/hanwen/go-fuse/v2/fuse/test.NewTestCase.func1()
    /home/hanwen/vc/go-fuse/fuse/test/loopback_test.go:115 +0x26 fp=0xc000059fe0 sp=0xc000059fc8 pc=0x571446
runtime.goexit()
    /usr/lib/golang/src/runtime/asm_amd64.s:1598 +0x1 fp=0xc000059fe8 sp=0xc000059fe0 pc=0x46b301
created by github.com/hanwen/go-fuse/v2/fuse/test.NewTestCase
    /home/hanwen/vc/go-fuse/fuse/test/loopback_test.go:115 +0x70a

goroutine 326 [GC worker (idle)]:
runtime.gopark(0x41859d212bd6?, 0xc000223fd0?, 0xd?, 0x72?, 0xc0003789c0?)
    /usr/lib/golang/src/runtime/proc.go:381 +0xd6 fp=0xc000223f50 sp=0xc000223f30 pc=0x43a016
runtime.gcBgMarkWorker()
    /usr/lib/golang/src/runtime/mgc.go:1275 +0xf1 fp=0xc000223fe0 sp=0xc000223f50 pc=0x41bbb1
runtime.goexit()
    /usr/lib/golang/src/runtime/asm_amd64.s:1598 +0x1 fp=0xc000223fe8 sp=0xc000223fe0 pc=0x46b301
created by runtime.gcBgMarkStartWorkers
    /usr/lib/golang/src/runtime/mgc.go:1199 +0x25

goroutine 394 [runnable]:
syscall.Syscall(0x471dee?, 0xc0000c6cf8?, 0xc0007214a0?, 0x5bfa60?)
    /usr/lib/golang/src/syscall/syscall_linux.go:69 +0x27 fp=0xc00005ee30 sp=0xc00005edc0 pc=0x47baa7
syscall.read(0xc0000c6cf8?, {0xc00064a1b0?, 0xc000639548?, 0xc00005ef38?})
    /usr/lib/golang/src/syscall/zsyscall_linux_amd64.go:711 +0x45 fp=0xc00005ee70 sp=0xc00005ee30 pc=0x479cc5
syscall.Read(...)
    /usr/lib/golang/src/syscall/syscall_unix.go:178
github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest.func1(...)
    /home/hanwen/vc/go-fuse/fuse/server.go:334
github.com/hanwen/go-fuse/v2/fuse.handleEINTR(...)
    /home/hanwen/vc/go-fuse/fuse/server.go:309
github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest(0xc0000c6c60, 0x40?)
    /home/hanwen/vc/go-fuse/fuse/server.go:332 +0x1c5 fp=0xc00005ef48 sp=0xc00005ee70 pc=0x53a6c5
github.com/hanwen/go-fuse/v2/fuse.(*Server).loop(0xc0000c6c60, 0xb8?)
    /home/hanwen/vc/go-fuse/fuse/server.go:477 +0x72 fp=0xc00005efc0 sp=0xc00005ef48 pc=0x53b472
github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest.func3()
    /home/hanwen/vc/go-fuse/fuse/server.go:366 +0x2b fp=0xc00005efe0 sp=0xc00005efc0 pc=0x53abeb
runtime.goexit()
    /usr/lib/golang/src/runtime/asm_amd64.s:1598 +0x1 fp=0xc00005efe8 sp=0xc00005efe0 pc=0x46b301
created by github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest
    /home/hanwen/vc/go-fuse/fuse/server.go:366 +0x545

goroutine 387 [syscall]:
syscall.Syscall(0x471dee?, 0xc0000c6cf8?, 0xc0007214e8?, 0x5bfa60?)
    /usr/lib/golang/src/syscall/syscall_linux.go:69 +0x27 fp=0xc00005fe30 sp=0xc00005fdc0 pc=0x47baa7
syscall.read(0xc0000c6cf8?, {0xc00066c1b0?, 0xc000639788?, 0xc00005ff38?})
    /usr/lib/golang/src/syscall/zsyscall_linux_amd64.go:711 +0x45 fp=0xc00005fe70 sp=0xc00005fe30 pc=0x479cc5
syscall.Read(...)
    /usr/lib/golang/src/syscall/syscall_unix.go:178
github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest.func1(...)
    /home/hanwen/vc/go-fuse/fuse/server.go:334
github.com/hanwen/go-fuse/v2/fuse.handleEINTR(...)
    /home/hanwen/vc/go-fuse/fuse/server.go:309
github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest(0xc0000c6c60, 0x80?)
    /home/hanwen/vc/go-fuse/fuse/server.go:332 +0x1c5 fp=0xc00005ff48 sp=0xc00005fe70 pc=0x53a6c5
github.com/hanwen/go-fuse/v2/fuse.(*Server).loop(0xc0000c6c60, 0xc8?)
    /home/hanwen/vc/go-fuse/fuse/server.go:477 +0x72 fp=0xc00005ffc0 sp=0xc00005ff48 pc=0x53b472
github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest.func3()
    /home/hanwen/vc/go-fuse/fuse/server.go:366 +0x2b fp=0xc00005ffe0 sp=0xc00005ffc0 pc=0x53abeb
runtime.goexit()
    /usr/lib/golang/src/runtime/asm_amd64.s:1598 +0x1 fp=0xc00005ffe8 sp=0xc00005ffe0 pc=0x46b301
created by github.com/hanwen/go-fuse/v2/fuse.(*Server).readRequest
    /home/hanwen/vc/go-fuse/fuse/server.go:366 +0x545

rax    0xca
rbx    0x0
rcx    0x46d203
rdx    0x0
rdi    0x7857e8
rsi    0x80
rbp    0x7fff10699aa8
rsp    0x7fff10699a60
r8     0x0
r9     0x0
r10    0x0
r11    0x286
r12    0x442280
r13    0x20
r14    0x7850c0
r15    0x100000
rip    0x46d201
rflags 0x286
cs     0x33
fs     0x0
gs     0x0
FAIL    github.com/hanwen/go-fuse/v2/fuse/test  56.620s
ok      github.com/hanwen/go-fuse/v2/internal   0.006s
?       github.com/hanwen/go-fuse/v2/internal/testutil  [no test files]
?       github.com/hanwen/go-fuse/v2/internal/utimens   [no test files]
ok      github.com/hanwen/go-fuse/v2/newunionfs 0.285s
?       github.com/hanwen/go-fuse/v2/posixtest  [no test files]
ok      github.com/hanwen/go-fuse/v2/splice 0.008s
ok      github.com/hanwen/go-fuse/v2/zipfs  0.038s
FAIL
hanwen commented 9 months ago

=== RUN   TestFlockExclusive
12:24:24.437247 callFusermount: executing ["/usr/bin/fusermount3" "/tmp/TestFlockExclusive3375078584/001/mnt" "-o" "subtype=pathfs.pathInode,max_read=131072"]
12:24:24.442108 rx 2: INIT n0 {7.38 Ra 131072 ASYNC_READ,POSIX_LOCKS,ATOMIC_O_TRUNC,EXPORT_SUPPORT,BIG_WRITES,DONT_MASK,SPLICE_WRITE,SPLICE_MOVE,SPLICE_READ,FLOCK_LOCKS,IOCTL_DIR,AUTO_INVAL_DATA,READDIRPLUS,READDIRPLUS_AUTO,ASYNC_DIO,WRITEBACK_CACHE,NO_OPEN_SUPPORT,PARALLEL_DIROPS,HANDLE_KILLPRIV,POSIX_ACL,ABORT_ERROR,MAX_PAGES,CACHE_SYMLINKS,NO_OPENDIR_SUPPORT,EXPLICIT_INVAL_DATA,0x70000000} "\x0f\x00\x00\x00\x00\x00\x00\x00"... 48b
12:24:24.442153 tx 2:     OK, {7.28 Ra 131072 ASYNC_READ,BIG_WRITES,AUTO_INVAL_DATA,READDIRPLUS,NO_OPEN_SUPPORT,PARALLEL_DIROPS,MAX_PAGES 0/0 Wr 131072 Tg 0 MaxPages 32}
12:24:24.442466 rx 4: LOOKUP n1 [".go-fuse-epoll-hack"] 20b
12:24:24.442540 tx 4:     OK, {n18446744073709551615 g0 tE=0s tA=0s {M0100644 SZ=0 L=1 0:0 B0*0 i0:18446744073709551615 A 0.000000 M 0.000000 C 0.000000}}
12:24:24.442641 rx 6: OPEN n18446744073709551615 {LARGEFILE} 
12:24:24.442821 tx 6:     OK, {Fh 18446744073709551615 }
12:24:24.442903 rx 8: POLL n18446744073709551615 
12:24:24.442929 tx 8:     38=function not implemented
12:24:24.442991 rx 10: FLUSH n18446744073709551615 {Fh 18446744073709551615} 
12:24:24.443013 tx 10:     OK
12:24:24.443223 rx 12: RELEASE n18446744073709551615 {Fh 18446744073709551615 LARGEFILE  L0} 
12:24:24.443245 tx 12:     OK
12:24:24.443282 rx 14: LOOKUP n1 ["hello.txt"] 10b
12:24:24.443345 tx 14:     OK, {n3 g2 tE=0.1s tA=0.1s {M0100700 SZ=3 L=1 1000:1000 B8*4096 i0:11337 A 1695551064.442542 M 1695551064.442542 C 1695551064.442542}}
12:24:24.443429 rx 16: OPEN n3 {WRONLY,LARGEFILE} 
12:24:24.443507 tx 16:     OK, {Fh 2 }
12:24:24.443701 rx 18: OPEN n3 {WRONLY,LARGEFILE} 
12:24:24.443803 tx 18:     OK, {Fh 3 }
(hung)
hanwen commented 9 months ago

it hangs on executing the external flock command. The flock is not the in ps -ef output. The mount doesn't respond to anything, even though there are many goroutines blocked on reading from the fuse device.

hanwen commented 7 months ago

Further investigation:

strace dump:

[pid 87039] clone(child_stack=NULL, flags=CLONE_VM|CLONE_VFORK|SIGCHLD <unfinished ...>
strace: Process 87046 attached
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGHUP, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] rt_sigaction(SIGINT, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87046] rt_sigaction(SIGQUIT, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGILL, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] rt_sigaction(SIGTRAP, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87046] rt_sigaction(SIGABRT, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGBUS, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] rt_sigaction(SIGFPE, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87046] rt_sigaction(SIGUSR1, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGSEGV, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGUSR2, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGPIPE, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGALRM, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGTERM, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGSTKFLT, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGCHLD, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGURG, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGXCPU, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGXFSZ, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGVTALRM, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGPROF, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGWINCH, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGIO, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGPWR, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGSYS, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_3, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_4, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_5, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_6, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_7, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_8, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_9, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_10, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_11, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_12, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_13, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_14, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_15, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_16, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_17, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_18, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_19, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_20, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_21, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_22, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_23, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_24, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_25, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_26, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_27, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_28, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_29, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_30, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0},  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigaction resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] rt_sigaction(SIGRT_31, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigaction(SIGRT_32, {sa_handler=SIG_DFL, sa_mask=~[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x46d0a0}, NULL, 8) = 0
[pid 87046] rt_sigprocmask(SIG_SETMASK, [],  <unfinished ...>
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] <... rt_sigprocmask resumed>NULL, 8) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87046] dup3(11, 0, 0)              = 0
[pid 87046] dup3(13, 1, 0)              = 1
[pid 87046] dup3(13, 2, 0)              = 2
[pid 87037] <... nanosleep resumed>NULL) = 0
[pid 87046] dup3(9, 3, 0 <unfinished ...>
[pid 87037] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 87044] <... read resumed>"@\0\0\0\31\0\0\0\24\0\0\0\0\0\0\0\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 131200) = 64
...
[pid 87037] nanosleep({tv_sec=0, tv_nsec=640000}, NULL) = 0
[pid 87037] epoll_pwait(4, [{events=EPOLLOUT, data={u32=3789921368, u64=139800680438872}}], 128, 0, NULL, 0) = 1
[pid 87037] getpid()                    = 87036
[pid 87037] tgkill(87036, 87039, SIGURG) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=1280000}, NULL) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=2560000}, NULL) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=5120000}, NULL) = 0
[pid 87037] epoll_pwait(4, [], 128, 0, NULL, 0) = 0
[pid 87037] nanosleep({tv_sec=0, tv_nsec=10000000}, NULL) = 0
..
(epoll/nanosleep repeats.)

successful run:

GOMAXPROCS=2 ... 
tick 2023-11-08 15:19:32.447870732 +0100 CET m=+0.808127074
tick 2023-11-08 15:19:32.547355177 +0100 CET m=+0.907611530
tick 2023-11-08 15:19:32.647782274 +0100 CET m=+1.008038615
15:19:32.647959 cmd /does/not/exist --exclusive --nonblock 3
15:19:32.648588 rx 20: FLUSH n3 {Fh 2} 
15:19:32.648627 tx 20:     OK
15:19:32.649139 rx 22: FLUSH n3 {Fh 3} 
15:19:32.649185 tx 22:     OK
15:19:32.649262 rx 24: FLUSH n3 {Fh 3} 
15:19:32.649285 tx 24:     OK
15:19:32.649494 start err fork/exec /does/not/exist: no such file or directory

strace of successful run:

[pid 87474] dup3(9, 3, 0 <unfinished ...>
[pid 87467] <... read resumed>"@\0\0\0\31\0\0\0\24\0\0\0\0\0\0\0\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 131200) = 64
[pid 87467] write(2, "15:22:54.675266 rx 20: FLUSH n3 "..., 4015:22:54.675266 rx 20: FLUSH n3 {Fh 2} 
 <unfinished ...>

looks like

1) dup3() results in a FUSE call for FLUSH opcode. 2) in the hanging case, there is no OS thread anymore to run any of the FUSE readers.

in the hanging case, we can see:

[pid 87044] <... read resumed>"@\0\0\0\31\0\0\0\24\0\0\0\0\0\0\0\3\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 131200) = 64
...

oct(\31) is 25 , ie. the FLUSH opcode.

ie, the read syscall is executed, but we never see rx FLUSH.

hanwen commented 7 months ago

FLUSH is normally due to a close() call, not dup(), however, these dup3() calls are overwriting CLOSE_ON_EXEC fds?

hanwen commented 7 months ago

Removing the flock in the caller retains the hang.

hanwen commented 7 months ago

Here is what happens:

f1, .. := open()
// fd=3
f2, .. := open()
// fd=9

// setup subprocess:
fork()
if child {
  // Inherit fd 9 as fd 3.
  dup3(9, 3, 0x0);
}

dup3(9, 3, 0x0) closes the existing fd 3 and then moves fd 9 to the freed up fd 3. The existing fd 3 is FUSE backed, so closing it calls back into us with a FLUSH call. However, the golang runtime doesn't consider that dup3() can block (AFAICT). So we have 3 threads with work to do:

  1. the parent (waiting for the fork setup to complete successfully)
  2. the child busy calling dup3
  3. the FUSE handler, reading from the fuse device.

With GOMAXPROCS=1, we only have one thread which is busy in 1. & 2.

The dup results in a close, which causes the kernel to fill a buffer for the FLUSH opcode for the fuse device. From the syscall perspective, the kernel completes the read() calls (hence it appears in strace output), but the golang runtime will not schedule the corresponding goroutine to run, because there are no slots available.

hanwen commented 7 months ago

ideas:

The latter seems theoretically possible, but not sure if it is practically possible. This would be a change in the linux VFS layer (not the FUSE layer).

rfjakob commented 7 months ago

I didn't get where the number 3 comes from

On Wed, 8 Nov 2023, 18:32 Han-Wen Nienhuys, @.***> wrote:

Closed #489 https://github.com/hanwen/go-fuse/issues/489 as completed via 27a473d https://github.com/hanwen/go-fuse/commit/27a473d50345f124054dd9cbca44562f68ce15da .

— Reply to this email directly, view it on GitHub https://github.com/hanwen/go-fuse/issues/489#event-10903893105, or unsubscribe https://github.com/notifications/unsubscribe-auth/AACGA73M3RA3Y7LQAXLWAO3YDO64HAVCNFSM6AAAAAA5E2YKJOVHI2DSMVQWIX3LMV45UABCJFZXG5LFIV3GK3TUJZXXI2LGNFRWC5DJN5XDWMJQHEYDGOBZGMYTANI . You are receiving this because you are subscribed to this thread.Message ID: @.***>

rfjakob commented 7 months ago

Ah, now I do. Fds 0, 1, 2, 3 get dup'ed over. 0, 1, 2 are always dup'ed over when forking, and 3 is from cmd.ExtraFiles. And 3 is possibly a FUSE-backed fd.

rfjakob commented 7 months ago

How about this instead:

diff --git a/fuse/test/file_lock_test.go b/fuse/test/file_lock_test.go
index 398f21c..7cbbe62 100644
--- a/fuse/test/file_lock_test.go
+++ b/fuse/test/file_lock_test.go
@@ -70,9 +70,9 @@ func runExternalFlock(flockPath, fname string) ([]byte, error) {
        // in order to test the lock property we must use cmd.ExtraFiles (instead of passing the actual file)
        // if we were to pass the file then this flock command would fail to place the lock (returning a
        // 'file busy' error) as it is already opened and locked at this point (see above)
-       cmd := exec.Command(flockPath, "--exclusive", "--nonblock", "3")
+       cmd := exec.Command(flockPath, "--exclusive", "--nonblock", "1")
        cmd.Env = append(cmd.Env, "LC_ALL=C") // in case the user's shell language is different
-       cmd.ExtraFiles = []*os.File{f}
+       cmd.Stdout = f
        return cmd.CombinedOutput()
 }
hanwen commented 7 months ago

that would probably work too, but also needs a comment. Also, if the test runner does tricks with redirecting stdout, it's possible that fd 1 is actually available and could get assigned to for an open.