Open erneestoc opened 10 months ago
Hi, could you share the flags/config file you're running bazel-remote with? Is this the entire crash output?
I hit this as well on OSX ARM64
./bazel-remote --dir /tmp/bazel --max_size 5
I can't post the full logs because they're 45M. The crash output contains stack traces for all 32k goroutines. Logs before the crash...
2024/01/06 00:44:27 GRPC CAS HEAD 8eb1f3322cd07a04f7f6e6b2d3e9bf53f7c2611a7649748d35bd7adb36f89bdc OK
2024/01/06 00:44:27 GRPC CAS HEAD 990fb33d8ec4225cf16e6cb414a8da1ff8ac17bb8348ee22d37ab797be4922d5 OK
2024/01/06 00:44:27 GRPC CAS HEAD dfa4599da6b9c68c4cdb311798adb46476440585242069db7942b206cd77a6b3 OK
2024/01/06 00:44:27 GRPC CAS HEAD bb3fcf3f6fad09713f161687ff7ed1f8f289c8142554f1e0214486921af4b595 OK
2024/01/06 00:44:27 GRPC CAS HEAD d94f2fc2f752db237950f180c4a707abb8c7c05ca68077c219b34dca0e67b211 OK
2024/01/06 00:44:28 GRPC BYTESTREAM WRITE COMPLETED: uploads/f0ccdb56-2e3d-4f8e-b03a-be505a6ea252/blobs/aac7cfdddf61c7b7ae93c283cfdc81a229f4ccb43f98c39d20ab3f7a50b6b34c/2753
runtime/cgo: pthread_create failed: Resource temporarily unavailable
Some example traces are:
goroutine 1 [semacquire, 1 minutes]:
runtime.gopark(0x101ef6b00?, 0x100d1ceac?, 0x40?, 0x8e?, 0x18?)
GOROOT/src/runtime/proc.go:381 +0xe4 fp=0x14000e2f970 sp=0x14000e2f950 pc=0x100d4a264
runtime.goparkunlock(...)
GOROOT/src/runtime/proc.go:387
runtime.semacquire1(0x14000dae850, 0x9c?, 0x1, 0x0, 0x40?)
GOROOT/src/runtime/sema.go:160 +0x21c fp=0x14000e2f9d0 sp=0x14000e2f970 pc=0x100d5c3cc
sync.runtime_Semacquire(0x1400039fa48?)
GOROOT/src/runtime/sema.go:62 +0x2c fp=0x14000e2fa10 sp=0x14000e2f9d0 pc=0x100d79abc
sync.(*WaitGroup).Wait(0x14000dae848)
GOROOT/src/sync/waitgroup.go:116 +0x78 fp=0x14000e2fa30 sp=0x14000e2fa10 pc=0x100d8a6c8
golang.org/x/sync/errgroup.(*Group).Wait(0x14000dae840)
external/org_golang_x_sync/errgroup/errgroup.go:53 +0x2c fp=0x14000e2fa50 sp=0x14000e2fa30 pc=0x10123f9bc
main.run(0x14000129880)
main.go:223 +0xba8 fp=0x14000e2fc30 sp=0x14000e2fa50 pc=0x1014d5e68
github.com/urfave/cli/v2.(*App).RunContext(0x1400016ea80, {0x101916820?, 0x14000196010}, {0x140001ac000, 0x5, 0x5})
external/com_github_urfave_cli_v2/app.go:395 +0xc04 fp=0x14000e2ff00 sp=0x14000e2fc30 pc=0x10143b1d4
github.com/urfave/cli/v2.(*App).Run(...)
external/com_github_urfave_cli_v2/app.go:252
main.main()
main.go:55 +0x138 fp=0x14000e2ff70 sp=0x14000e2ff00 pc=0x1014d5258
runtime.main()
GOROOT/src/runtime/proc.go:250 +0x248 fp=0x14000e2ffd0 sp=0x14000e2ff70 pc=0x100d49e38
runtime.goexit()
src/runtime/asm_arm64.s:1172 +0x4 fp=0x14000e2ffd0 sp=0x14000e2ffd0 pc=0x100d7e4c4
goroutine 13028 [chan receive]:
runtime.gopark(0x101787760?, 0x1400a98da40?, 0x60?, 0xa4?, 0x100de79f8?)
GOROOT/src/runtime/proc.go:381 +0xe4 fp=0x1400a98d7a0 sp=0x1400a98d780 pc=0x100d4a264
runtime.chanrecv(0x1400a74cf60, 0x1400a98da50, 0x1)
GOROOT/src/runtime/chan.go:583 +0x45c fp=0x1400a98d830 sp=0x1400a98d7a0 pc=0x100d1623c
runtime.chanrecv1(0x1400a74cf00?, 0x0?)
GOROOT/src/runtime/chan.go:442 +0x14 fp=0x1400a98d860 sp=0x1400a98d830 pc=0x100d15da4
github.com/buchgr/bazel-remote/v2/server.(*grpcServer).Write(0x14000dae940, {0x10191ae10?, 0x1400a76a430})
server/grpc_bytestream.go:564 +0x4fc fp=0x1400a98dac0 sp=0x1400a98d860 pc=0x1014ab9ec
google.golang.org/genproto/googleapis/bytestream._ByteStream_Write_Handler({0x1018d3fa0?, 0x14000dae940}, {0x101918d68?, 0x1400a2f9770})
bazel-out/darwin_arm64-fastbuild-ST-eccb913b7463/bin/external/go_googleapis/google/bytestream/bytestream_go_proto_/google.golang.org/genproto/googleapis/bytestream/bytestream.pb.go:709 +0x98 fp=0x1400a98db00 sp=0x1400a98dac0 pc=0x101495338
google.golang.org/grpc.(*Server).processStreamingRPC(0x140001583c0, {0x10191b4f8, 0x140107f3d40}, 0x1400a768480, 0x14000dad0e0, 0x101edc220, 0x0)
external/org_golang_google_grpc/server.go:1631 +0x1000 fp=0x1400a98de20 sp=0x1400a98db00 pc=0x1011fd650
google.golang.org/grpc.(*Server).handleStream(0x140001583c0, {0x10191b4f8, 0x140107f3d40}, 0x1400a768480, 0x0)
external/org_golang_google_grpc/server.go:1718 +0x7e4 fp=0x1400a98df50 sp=0x1400a98de20 pc=0x1011feb74
google.golang.org/grpc.(*Server).serveStreams.func1.1()
external/org_golang_google_grpc/server.go:959 +0x84 fp=0x1400a98dfd0 sp=0x1400a98df50 pc=0x1011f8544
runtime.goexit()
src/runtime/asm_arm64.s:1172 +0x4 fp=0x1400a98dfd0 sp=0x1400a98dfd0 pc=0x100d7e4c4
created by google.golang.org/grpc.(*Server).serveStreams.func1
external/org_golang_google_grpc/server.go:957 +0x164
And
goroutine 16113 [select]:
runtime.gopark(0x14004aa8f60?, 0x4?, 0x48?, 0x8d?, 0x14004aa8ea8?)
GOROOT/src/runtime/proc.go:381 +0xe4 fp=0x14004aa8d00 sp=0x14004aa8ce0 pc=0x100d4a264
runtime.selectgo(0x14004aa8f60, 0x14004aa8ea0, 0x140012d8d20?, 0x0, 0x0?, 0x1)
GOROOT/src/runtime/select.go:327 +0x690 fp=0x14004aa8e20 sp=0x14004aa8d00 pc=0x100d5b4a0
google.golang.org/grpc/internal/transport.(*http2Server).keepalive(0x1400049e9c0)
external/org_golang_google_grpc/internal/transport/http2_server.go:1155 +0x188 fp=0x14004aa8fb0 sp=0x14004aa8e20 pc=0x101199618
google.golang.org/grpc/internal/transport.NewServerTransport.func4()
external/org_golang_google_grpc/internal/transport/http2_server.go:344 +0x28 fp=0x14004aa8fd0 sp=0x14004aa8fb0 pc=0x101192be8
runtime.goexit()
src/runtime/asm_arm64.s:1172 +0x4 fp=0x14004aa8fd0 sp=0x14004aa8fd0 pc=0x100d7e4c4
created by google.golang.org/grpc/internal/transport.NewServerTransport
external/org_golang_google_grpc/internal/transport/http2_server.go:344 +0x1528
@DolceTriade: thanks for the extra details. 32k goroutines sounds quite high for a 5G cache, what kind of hardware are you using? Do you have a lot of users/a lot of incoming requests?
I'm running this solo on my regular M1 Mac laptop for testing purposes. I'm basically just building my company's bazel repository.
After investigating, I think it uploads a lot of Python files very quickly which causes this huge spike in uploads. I haven't tried this yet, but I suspect that if I rate limit uploads using --remote_max_connections=10
on bazel, it might mitigate the issue (this works with other remote caches I've tried). I've also crashed other remote caches while building our repository, and I suspect this is the reason. Bazel's default is 100 max remote connections, which doesn't seem like that much, but apparently bazel can multiplex up to 100 uploads over each connection — so with up to 10k simultaneous uploads, it seems feasible that we can hit this situation.
I can report that passing in --remote_max_connections=10
does indeed mitigate the problem. Ideally, bazel-remote should throttle incoming requests rather than crash when users forget to set that flag.
My testing command was:
bazel test -c opt --bes_backend=grpc://localhost:50332 --bes_results_url="http://localhost:3000/invocation/" --remote_cache=grpc://localhost:9092 --test_env=GO_TEST_WRAP_TESTV=1 --remote_max_connections=10 -- //...
Running into this issue consistently on macOS. Is there some configuration I should set to make it work correctly? I'm not seeing this error with the docker image.