oxidecomputer / crucible

A storage service.
Mozilla Public License 2.0
174 stars 18 forks source link

Upstairs repair task never finished after a second sled-agent restart #845

Open askfongjojo opened 1 year ago

askfongjojo commented 1 year ago

A live repair was kicked off after I svcadm restarted the sled-agent of gc21 for the instance named "sysbench-mysql-7" on rack2. I saw that a repair task was kicked off. I restarted the sled-agent once more (this time with svcadm disable sled-agent && sleep 60 && svcadm enable sled-agent) afterwards. The instance was stuck in read-only disk mode after that:

[323288.801529] EXT4-fs error (device nvme0n1p1): __ext4_find_entry:1663: inode #41: comm cron: reading directory lblock 0
[323288.804761] systemd-journald[374]: Failed to write entry (9 items, 334 bytes), ignoring: Read-only file system
[323288.810030] systemd-journald[374]: Failed to write entry (9 items, 335 bytes), ignoring: Read-only file system
...

This is the snippet of the propolis/crucible log around the time of the second sled-agent restart:

{"msg":"Live Repair in progress, try again","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:32:55.545223081Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Checking if live repair is needed","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:33:55.549037336Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Upstairs repair task running, trying again later","v":0,"name":"propolis-server","level":40,"time":"2023-07-21T21:33:55.549099063Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Live Repair in progress, try again","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:33:55.54911334Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Checking if live repair is needed","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:34:55.551798776Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Upstairs repair task running, trying again later","v":0,"name":"propolis-server","level":40,"time":"2023-07-21T21:34:55.551859041Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Live Repair in progress, try again","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:34:55.551870112Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Checking if live repair is needed","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:35:55.555134705Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Upstairs repair task running, trying again later","v":0,"name":"propolis-server","level":40,"time":"2023-07-21T21:35:55.555211201Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Live Repair in progress, try again","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:35:55.555230437Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] downstairs disconnected","v":0,"name":"propolis-server","level":40,"time":"2023-07-21T21:36:22.541313986Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 Gone missing, transition from LiveRepairReady to Faulted","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:22.54136876Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 connection to [fd00:1122:3344:101::b]:19001 closed","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:22.541380162Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] [fd00:1122:3344:101::b]:19001 task reports connection:false","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:22.541391383Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"f8b9438b-14c9-4f54-99ff-10bb014da9b0 Active Active Faulted","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:22.541400321Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] [fd00:1122:3344:101::b]:19001 task reports offline","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:22.541409548Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] pm_task rx.recv() is None","v":0,"name":"propolis-server","level":40,"time":"2023-07-21T21:36:22.541418906Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"connect timeout","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:33.543762226Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"connect timeout","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:44.547297232Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"connect timeout","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:55.550433161Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Checking if live repair is needed","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:55.558541578Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"No Live Repair required at this time","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:36:55.558586103Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"connect timeout","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:06.55380007Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"connect timeout","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:17.556959256Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
08d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"connect timeout","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:28.558661803Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"connect timeout","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:39.561163386Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] connecting to [fd00:1122:3344:101::b]:19001","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:40.562745224Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 looper connected","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.629498742Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"looper":"2","component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] Proc runs for [fd00:1122:3344:101::b]:19001 in state Faulted","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.629566471Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] upstairs guest_io_ready=TRUE, promote! session 19ea0aac-3dbf-499a-bf83-3cafb4e875ec","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.629992293Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] downstairs client at [fd00:1122:3344:101::b]:19001 has UUID 7f1da83b-fde2-4931-9955-516e8c301718","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.630608413Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] Got region def RegionDefinition { block_size: 512, extent_size: Block { value: 131072, shift: 9 }, extent_count: 1600, uuid: 7f1da83b-fde2-4931-9955-516e8c301718, encrypted: true, database_read_version: 1, database_write_version: 1 }","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.630656645Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Returning client:2 UUID:7f1da83b-fde2-4931-9955-516e8c301718 matches","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.630669589Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"f8b9438b-14c9-4f54-99ff-10bb014da9b0 Active Active Faulted","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.630678967Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 (19ea0aac-3dbf-499a-bf83-3cafb4e875ec) Active Active Faulted ds_transition to LiveRepairReady","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.690626117Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] Transition from Faulted to LiveRepairReady","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.690672916Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] new RM replaced this: Some(RegionMetadata { generation: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], flush_numbers: [3363, 3069268, 3066975, 3060130, 1483035, 1359673, 1359673, 164710, 1359672, 3063694, 1359673, 162324, 162391, 162002, 162003, 164710, 164711, 3069115, 3069300, 165401, 15109, 3068368, 3060130, 16846, 17414, 32016, 3068634, 3068056, 3068999, 3069104, 3069198, 3069055, 3068932, 3069275, 3069101, 3004330, 3069161, 3057240, 3069232, 3069278, 3069247, 3068891, 3069220, 3069273, 3069061, 3069300, 3069281, 3068700, 3069277, 3069232, 3069017, 3069274, 3069262, 3069262, 3069277, 3069236, 3069274, 3069258, 3069220, 3069232, 3069258, 3069252, 3069181, 3069224, 3069277, 3068814, 3068089, 3068747, 3069298, 3069211, 3069137, 3069065, 3069152, 3069254, 3069198, 3069125, 3069297, 3069152, 3069229, 3069028, 3069295, 3069144, 3069107, 3068891, 3068978, 3068978, 3068932, 3069055, 3068600, 3069213, 3069258, 3069249, 3069220, 3069278, 3069258, 3069258, 3069278, 3069170, 3068849, 3068997, 3068972, 3068564, 3069274, 3069055, 3069277, 3069277, 3069277, 3069258, 3069258, 3069274, 3069181, 3069163, 3069107, 3069297, 3069295, 3069068, 3069009, 3066457, 3066353, 3066343, 3066539, 3066232, 3066548, 391670, 395893, 428217, 3066258, 3065713, 612971, 3066542, 1459980, 721670, 566830, 1939789, 3063469, 352730, 156085, 156278, 1500389, 3066549, 3062969, 3062885, 3065588, 3065158, 3066053, 3066548, 3064735, 3058773, 3065590, 3064857, 3066049, 3063643, 3004504, 3066539, 3066479, 2909363, 3063731, 3065428, 1688466, 3065701, 3064845, 3066323, 1810242, 2810011, 3065344, 3060219, 3065447, 3060458, 3064052, 3065097, 3065438, 3058854, 3066079, 2520804, 2514856, 3057127, 3066272, 3064864, 2874382, 2955512, 2977937, 3011441, 2992031, 3063799, 3066579, 0, 0, 0, 0, 0, 0, 0, 0, 2991675, 1924, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2010, 2193, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 222, 0, 0, 0, 0, 0, 2260, 2628, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2682, 2900, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3037, 3310, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3433, 3663, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3821, 3984, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4098, 4275, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4327, 4615, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4689, 4865, 0, 0, 0, 0, 222, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4999, 5274, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5398, 5630, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5734, 6006, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6149, 6495, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6635, 6921, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7036, 7316, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 222, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7421, 7642, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7742, 7961, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8048, 8269, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,....
{"msg":"[2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 Enter Ready for LiveRepair mode","v":0,"name":"propolis-server","level":40,"time":"2023-07-21T21:37:56.69140023Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"} 
{"msg":"[2] Starts cmd_loop","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.691413475Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"[2] [fd00:1122:3344:101::b]:19001 task reports connection:true","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.691424436Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"f8b9438b-14c9-4f54-99ff-10bb014da9b0 Active Active LiveRepairReady","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.691436419Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Set check for repair","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:56.691447029Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Checking if live repair is needed","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:57.691994986Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"} 
{"msg":"Upstairs repair task running, trying again later","v":0,"name":"propolis-server","level":40,"time":"2023-07-21T21:37:57.692047285Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}
{"msg":"Live Repair in progress, try again","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:37:57.692057515Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"} 
{"msg":"Checking if live repair is needed","v":0,"name":"propolis-server","level":30,"time":"2023-07-21T21:38:57.696048528Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"} 
{"msg":"Upstairs repair task running, trying again later","v":0,"name":"propolis-server","level":40,"time":"2023-07-21T21:38:57.696155582Z","hostname":"oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1","pid":29224,"component":"crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0"}

The propolis log file can be found in catacomb: /data/staff/dogfood/jul23/propolis-log-gc16-665308d7-1a09-44fa-ae0c-c089cea58aa1.log.1

jordanhendricks commented 1 year ago

This looks a lot like #837?

jordanhendricks commented 1 year ago

A quick glance at the rest of the propolis log indicates the the upstairs was stuck in a live repair loop, as in #837.

Similar to #837, we see that downstairs came and went a couple of times. I see that downstairs 2 was marked as Faulted, then marked as LiveRepairReady aaround 21:22:30:

[2023-07-21T21:22:30.585714549Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 (19ea0aac-3dbf-499a-bf83-3cafb4e875ec) Active Active Offline ds_transition to Faulted
[2023-07-21T21:22:30.58576227Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Transition from Offline to Faulted
[2023-07-21T21:22:33.796080342Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:22:44.79854503Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:22:55.800669855Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:23:06.803722646Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:23:17.80613514Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:23:18.806831457Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] connecting to [fd00:1122:3344:101::b]:19001 (looper=2)
[2023-07-21T21:23:29.947888358Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] connecting to [fd00:1122:3344:101::b]:19001 (looper=2)
[2023-07-21T21:23:30.949651624Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 looper connected (looper=2)
[2023-07-21T21:23:30.949694806Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Proc runs for [fd00:1122:3344:101::b]:19001 in state Faulted
[2023-07-21T21:23:30.950255513Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] upstairs guest_io_ready=TRUE, promote! session 19ea0aac-3dbf-499a-bf83-3cafb4e875ec
[2023-07-21T21:23:30.951003166Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] downstairs client at [fd00:1122:3344:101::b]:19001 has UUID 7f1da83b-fde2-4931-9955-516e8c301718
[2023-07-21T21:23:30.951038994Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Got region def RegionDefinition { block_size: 512, extent_size: Block { value: 131072, shift: 9 }, extent_count: 1600, uuid: 7f1da83b-fde2-4931-9955-516e8c301718, encrypted: true, database_read_version: 1, database_write_version: 1 }
[2023-07-21T21:23:30.951167799Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: Returning client:2 UUID:7f1da83b-fde2-4931-9955-516e8c301718 matches
[2023-07-21T21:23:30.951186324Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: f8b9438b-14c9-4f54-99ff-10bb014da9b0 Active Active Faulted
[2023-07-21T21:23:31.019004099Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 (19ea0aac-3dbf-499a-bf83-3cafb4e875ec) Active Active Faulted ds_transition to LiveRepairReady
[2023-07-21T21:23:31.019163041Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Transition from Faulted to LiveRepairReady
[2023-07-21T21:23:31.022155925Z]  WARN: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] new RM replaced this: Some(RegionMetadata { gener

And then again around 21:25:53:

[2023-07-21T21:25:53.464566863Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 Gone missing, transition from LiveRepair to Faulted
[2023-07-21T21:25:53.464578465Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 connection to [fd00:1122:3344:101::b]:19001 closed (looper=2)
[2023-07-21T21:25:53.464592472Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] [fd00:1122:3344:101::b]:19001 task reports connection:false
[2023-07-21T21:25:53.464603704Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: f8b9438b-14c9-4f54-99ff-10bb014da9b0 Active Active Faulted
[2023-07-21T21:25:53.4646124Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] [fd00:1122:3344:101::b]:19001 task reports offline
[2023-07-21T21:25:53.464629483Z]  WARN: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] pm_task rx.recv() is None
[2023-07-21T21:25:54.466611185Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 looper connected (looper=2)
[2023-07-21T21:25:54.466652183Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Proc runs for [fd00:1122:3344:101::b]:19001 in state Faulted
[2023-07-21T21:25:54.466838608Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] upstairs guest_io_ready=TRUE, promote! session 19ea0aac-3dbf-499a-bf83-3cafb4e875ec
[2023-07-21T21:25:54.468762496Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] downstairs client at [fd00:1122:3344:101::b]:19001 has UUID 7f1da83b-fde2-4931-9955-516e8c301718
[2023-07-21T21:25:54.468802853Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Got region def RegionDefinition { block_size: 512, extent_size: Block { value: 131072, shift: 9 }, extent_count: 1600, uuid: 7f1da83b-fde2-4931-9955-516e8c301718, encrypted: true, database_read_version: 1, database_write_version: 1 }
[2023-07-21T21:25:54.468813253Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: Returning client:2 UUID:7f1da83b-fde2-4931-9955-516e8c301718 matches
[2023-07-21T21:25:54.46882238Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: f8b9438b-14c9-4f54-99ff-10bb014da9b0 Active Active Faulted
[2023-07-21T21:25:54.520112341Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 (19ea0aac-3dbf-499a-bf83-3cafb4e875ec) Active Active Faulted ds_transition to LiveRepairReady
[2023-07-21T21:25:54.520155803Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Transition from Faulted to LiveRepairReady

then at 21:36:22:

[2023-07-21T21:36:22.54136876Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 Gone missing, transition from LiveRepairReady to Fault
ed
[2023-07-21T21:36:22.541380162Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 connection to [fd00:1122:3344:101::b]:19001 closed (l
ooper=2)
[2023-07-21T21:36:22.541391383Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] [fd00:1122:3344:101::b]:19001 task reports connection:false
[2023-07-21T21:36:22.541400321Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: f8b9438b-14c9-4f54-99ff-10bb014da9b0 Active Active Faulted
[2023-07-21T21:36:22.541409548Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] [fd00:1122:3344:101::b]:19001 task reports offline
[2023-07-21T21:36:22.541418906Z]  WARN: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] pm_task rx.recv() is None
[2023-07-21T21:36:33.543762226Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:36:44.547297232Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:36:55.550433161Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:36:55.558541578Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: Checking if live repair is needed
[2023-07-21T21:36:55.558586103Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: No Live Repair required at this time
[2023-07-21T21:37:06.55380007Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:37:17.556959256Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:37:28.558661803Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:37:39.561163386Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: connect timeout (looper=2)
[2023-07-21T21:37:40.562745224Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] connecting to [fd00:1122:3344:101::b]:19001 (looper=2)
[2023-07-21T21:37:56.629498742Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 looper connected (looper=2)
[2023-07-21T21:37:56.629566471Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Proc runs for [fd00:1122:3344:101::b]:19001 in state Faulted
[2023-07-21T21:37:56.629992293Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] upstairs guest_io_ready=TRUE, promote! session 19ea0aac-3dbf-499a-bf83-3cafb4e875ec
[2023-07-21T21:37:56.630608413Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] downstairs client at [fd00:1122:3344:101::b]:19001 has UUID 7f1da83b-fde2-4931-9955-516e8c301718
[2023-07-21T21:37:56.630656645Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Got region def RegionDefinition { block_size: 512, extent_size: Block { value: 131072, shift: 9 }, extent_count: 1600, uuid: 7f1da83b-fde2-4931-9955-516e8c301718, encrypted: true, database_read_version: 1, database_write_version: 1 }
[2023-07-21T21:37:56.630669589Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: Returning client:2 UUID:7f1da83b-fde2-4931-9955-516e8c301718 matches
[2023-07-21T21:37:56.630678967Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: f8b9438b-14c9-4f54-99ff-10bb014da9b0 Active Active Faulted
[2023-07-21T21:37:56.690626117Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] f8b9438b-14c9-4f54-99ff-10bb014da9b0 (19ea0aac-3dbf-499a-bf83-3cafb4e875ec) Active Active Faulted ds_transition to LiveRepairReady
[2023-07-21T21:37:56.690672916Z]  INFO: propolis-server/crucible-f8b9438b-14c9-4f54-99ff-10bb014da9b0/29224 on oxz_propolis-server_665308d7-1a09-44fa-ae0c-c089cea58aa1: [2] Transition from Faulted to LiveRepairReady
jordanhendricks commented 1 year ago

I think this is indeed the exact same failure as #837. That propolis-server is currently pid 8628 on host 9 in rack2. None of its nvme emulation probes are firing; the guest is only spinning reading and writing from the UART, as with other instances I've looked at with this failure mode:


# dtrace -n propolis8682:propolis-server::
...
118  27455 _ZN8propolis3pio6PioBus9handle_in17hc848d06bc6657097E:pio_in 
118  27493 _ZN8propolis4vcpu4Vcpu3run17h0972543c82addf41E:vm_entry 
118  27494 _ZN8propolis4vcpu4Vcpu3run17h0972543c82addf41E:vm_exit 
118  27489 _ZN8propolis2hw4uart9uart165504Uart9reg_write17h0068cf248a51f3dcE:uart_reg_write 
118  27456 _ZN8propolis3pio6PioBus10handle_out17h036ab58b729d535dE:pio_out 
118  27493 _ZN8propolis4vcpu4Vcpu3run17h0972543c82addf41E:vm_entry 
118  27494 _ZN8propolis4vcpu4Vcpu3run17h0972543c82addf41E:vm_exit 
118  27488 _ZN8propolis2hw4uart9uart165504Uart8reg_read17hb02669f628d1ab98E:uart_reg_read 
118  27455 _ZN8propolis3pio6PioBus9handle_in17hc848d06bc6657097E:pio_in 
118  27493 _ZN8propolis4vcpu4Vcpu3run17h0972543c82addf41E:vm_entry 
118  27494 _ZN8propolis4vcpu4Vcpu3run17h0972543c82addf41E:vm_exit 
118  27488 _ZN8propolis2hw4uart9uart165504Uart8reg_read17hb02669f628d1ab98E:uart_reg_read 
118  27455 _ZN8propolis3pio6PioBus9handle_in17hc848d06bc6657097E:pio_in 
118  27493 _ZN8propolis4vcpu4Vcpu3run17h0972543c82addf41E:vm_entry 
118  27494 _ZN8propolis4vcpu4Vcpu3run17h0972543c82addf41E:vm_exit 
118  27489 _ZN8propolis2hw4uart9uart165504Uart9reg_write17h0068cf248a51f3dcE:uart_reg_write 
118  27456 _ZN8propolis3pio6PioBus10handle_out17h036ab58b729d535dE:pio_out 
118  27493 _ZN8propolis4vcpu4Vcpu3run17h0972543c82addf41E:vm_entry 

``
jordanhendricks commented 1 year ago

The downstairs addresses are:

The third one is the faulted one. That's the zone oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b on gc 21. I'm not sure how to go from that to the correct downstairs. There are several:

root@oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b:~# svcs | grep downstairs | awk '{print $3}' 
svc:/oxide/crucible/downstairs:downstairs-02851bb6-ca32-417f-bdf2-6fc58a8a5570
svc:/oxide/crucible/downstairs:downstairs-276b6692-4357-4b39-9476-ba4372a4be63
svc:/oxide/crucible/downstairs:downstairs-3c14611c-b0b4-45c8-bd12-1ade1c3dda82
svc:/oxide/crucible/downstairs:downstairs-668d0d58-210f-41a2-8aa2-fb1e96d275fd
svc:/oxide/crucible/downstairs:downstairs-7448b0bd-006e-4afe-ab9a-0888a863f657
svc:/oxide/crucible/downstairs:downstairs-7f1da83b-fde2-4931-9955-516e8c301718
svc:/oxide/crucible/downstairs:downstairs-84818dde-9adf-4ef8-a926-134ca6cc4aa0
svc:/oxide/crucible/downstairs:downstairs-9bc82df2-edcd-464e-a5c1-cf1ccd686878
svc:/oxide/crucible/downstairs:downstairs-9d9f30b9-3cba-4f8b-9fdc-3f556b5b7b3d
svc:/oxide/crucible/downstairs:downstairs-b6c24008-5a8d-4902-9c1e-bc2c9f2c905a
svc:/oxide/crucible/downstairs:downstairs-c60dabc1-ecf0-4414-a5fe-f2eb398a7580
svc:/oxide/crucible/downstairs:downstairs-ef2bf108-1da5-49ec-a6b2-f2f43621e5a7

Since I don't recall at the moment how to map the propolis log to a downstairs. I've grabbed all the downstairs logfiles and put them at /data/staff/dogfood/jul23/crucible-845/:

jordan@catacomb ~ $ ls -lh /data/staff/dogfood/jul23/crucible-845
total 925
-rw-rw-r--+  1 jordan   staff      3.29K Jul 24 00:59 oxide-crucible-downstairs:downstairs-02851bb6-ca32-417f-bdf2-6fc58a8a5570.log
-rw-rw-r--+  1 jordan   staff       180K Jul 24 00:59 oxide-crucible-downstairs:downstairs-276b6692-4357-4b39-9476-ba4372a4be63.log
-rw-rw-r--+  1 jordan   staff      3.26K Jul 24 00:59 oxide-crucible-downstairs:downstairs-3c14611c-b0b4-45c8-bd12-1ade1c3dda82.log
-rw-rw-r--+  1 jordan   staff      3.26K Jul 24 00:59 oxide-crucible-downstairs:downstairs-668d0d58-210f-41a2-8aa2-fb1e96d275fd.log
-rw-rw-r--+  1 jordan   staff       941K Jul 24 00:59 oxide-crucible-downstairs:downstairs-7448b0bd-006e-4afe-ab9a-0888a863f657.log
-rw-rw-r--+  1 jordan   staff      2.00M Jul 24 00:59 oxide-crucible-downstairs:downstairs-7f1da83b-fde2-4931-9955-516e8c301718.log
-rw-rw-r--+  1 jordan   staff       740K Jul 24 00:59 oxide-crucible-downstairs:downstairs-84818dde-9adf-4ef8-a926-134ca6cc4aa0.log
-rw-rw-r--+  1 jordan   staff       482K Jul 24 01:00 oxide-crucible-downstairs:downstairs-9bc82df2-edcd-464e-a5c1-cf1ccd686878.log
-rw-rw-r--+  1 jordan   staff      1020K Jul 24 01:00 oxide-crucible-downstairs:downstairs-9d9f30b9-3cba-4f8b-9fdc-3f556b5b7b3d.log
-rw-rw-r--+  1 jordan   staff      3.26K Jul 24 01:00 oxide-crucible-downstairs:downstairs-b6c24008-5a8d-4902-9c1e-bc2c9f2c905a.log
-rw-rw-r--+  1 jordan   staff      3.26K Jul 24 01:00 oxide-crucible-downstairs:downstairs-c60dabc1-ecf0-4414-a5fe-f2eb398a7580.log
-rw-rw-r--+  1 jordan   staff      2.05M Jul 24 01:00 oxide-crucible-downstairs:downstairs-ef2bf108-1da5-49ec-a6b2-f2f43621e5a7.log
-rw-rw-r--+  1 jordan   staff      1.01M Jul 24 01:00 propolis-log-gc16-665308d7-1a09-44fa-ae0c-c089cea58aa1.log.1

7448b0bd-006e-4afe-ab9a-0888a863f65 seems to have some action.

askfongjojo commented 1 year ago

Thanks for looking into this @jordanhendricks. Yes, 7f1da83b-fde2-4931-9955-516e8c301718 is the one (sorry I should have copied the crucible logs as well). I was able to pinpoint that by grepping the disk id against the crucible log files. There are another 2 log files for 7f1da83b-fde2-4931-9955-516e8c301718 from 2 previous log rotations (.0 and .1). I've copied them to the crucible-845 folder in catacomb just now.

This is how I located the disk uuid in CRDB:

root@[fd00:1122:3344:102::3]:32221/omicron> select dataset.ip,instance.name,active_propolis_id as propolis,volume.id as vol,disk.id as disk
from instance
join disk on disk.attach_instance_id = instance.id
join volume on volume.id = disk.volume_id
join region on region.volume_id = volume.id
join dataset on dataset.id = region.dataset_id where instance.name = 'sysbench-mysql-7';
           ip           |       name        |               propolis               |                 vol                  |                 disk
------------------------+-------------------+--------------------------------------+--------------------------------------+---------------------------------------
  fd00:1122:3344:101::b | sysbench-mysql-7  | 665308d7-1a09-44fa-ae0c-c089cea58aa1 | 13692170-38c7-4e42-ae7f-e9668e9b6c52 | f8b9438b-14c9-4f54-99ff-10bb014da9b0
  fd00:1122:3344:105::c | sysbench-mysql-7  | 665308d7-1a09-44fa-ae0c-c089cea58aa1 | 13692170-38c7-4e42-ae7f-e9668e9b6c52 | f8b9438b-14c9-4f54-99ff-10bb014da9b0
  fd00:1122:3344:107::8 | sysbench-mysql-7  | 665308d7-1a09-44fa-ae0c-c089cea58aa1 | 13692170-38c7-4e42-ae7f-e9668e9b6c52 | f8b9438b-14c9-4f54-99ff-10bb014da9b0
(24 rows)

More context on the sled-agent restart - The crucible downstairs was removed and rebuilt twice after I rebooted sled-agent on gc21. The exact time when the events took place can be found in the sled-agent log lines below:

BRM42220031 # grep aa0307e6-4a0f-470b-a595-5a73f0f2911b /var/svc/log/oxide-sled-agent\:default.log.2 | looker
21:21:45.063Z WARN SledAgent: Deleting existing zone
    zone_name = oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b
21:22:43.565Z INFO SledAgent (BootstrapAgent): Configuring new Omicron zone: oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b
21:22:43.687Z INFO SledAgent (BootstrapAgent): Installing Omicron zone: oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b
21:23:11.504Z INFO SledAgent (BootstrapAgent): Profile for oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b:
    <!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1">
    <service_bundle type="profile" name="omicron">
      <service version="1" type="service" name="oxide/crucible/agent">
        <instance enabled="true" name="default">
          <property_group type="application" name="config">
            <propval type="astring" name="datalink" value="oxControlService4"/>
            <propval type="astring" name="gateway" value="fd00:1122:3344:101::1"/>
            <propval type="astring" name="dataset" value="oxp_fe25d21f-94bf-4275-9a1f-9fae7840570b/crucible"/>
            <propval type="astring" name="listen_addr" value="fd00:1122:3344:101::b"/>
            <propval type="astring" name="listen_port" value="32345"/>
            <propval type="astring" name="uuid" value="f41a06e5-df87-4400-a4ff-16c6cb2eebd6"/>
            <propval type="astring" name="store" value="/data"/>
          </property_group>
        </instance>
      </service>
    </service_bundle>
21:23:11.518Z INFO SledAgent (BootstrapAgent): Zone booting
    zone = oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b
21:36:20.764Z WARN SledAgent: Deleting existing zone
    zone_name = oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b
21:37:11.538Z INFO SledAgent (BootstrapAgent): Configuring new Omicron zone: oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b
21:37:11.649Z INFO SledAgent (BootstrapAgent): Installing Omicron zone: oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b
21:37:38.724Z INFO SledAgent (BootstrapAgent): Profile for oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b:
    <!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1">
    <service_bundle type="profile" name="omicron">
      <service version="1" type="service" name="oxide/crucible/agent">
        <instance enabled="true" name="default">
          <property_group type="application" name="config">
            <propval type="astring" name="datalink" value="oxControlService10"/>
            <propval type="astring" name="gateway" value="fd00:1122:3344:101::1"/>
            <propval type="astring" name="dataset" value="oxp_fe25d21f-94bf-4275-9a1f-9fae7840570b/crucible"/>
            <propval type="astring" name="listen_addr" value="fd00:1122:3344:101::b"/>
            <propval type="astring" name="listen_port" value="32345"/>
            <propval type="astring" name="uuid" value="4c3ae92c-552b-4a3b-997b-ed00fa21f013"/>
            <propval type="astring" name="store" value="/data"/>
          </property_group>
        </instance>
      </service>
    </service_bundle>
21:37:38.811Z INFO SledAgent (BootstrapAgent): Zone booting
    zone = oxz_crucible_aa0307e6-4a0f-470b-a595-5a73f0f2911b

The sled-agent log from gc21 has also been copied to catacomb.

angela@catacomb ~ $ ls -ltr /data/staff/dogfood/jul23/crucible-845/ | tail -4
-rw-rw-r--+  1 jordan   staff    1061290 Jul 24 01:00 propolis-log-gc16-665308d7-1a09-44fa-ae0c-c089cea58aa1.log.1
-rw-rw-r--+  1 angela   staff    2314984 Jul 24 04:56 oxide-crucible-downstairs:downstairs-7f1da83b-fde2-4931-9955-516e8c301718.log.0
-rw-rw-r--+  1 angela   staff    2852743 Jul 24 04:56 oxide-crucible-downstairs:downstairs-7f1da83b-fde2-4931-9955-516e8c301718.log.1
-rw-rw-r--+  1 angela   staff    5153328 Jul 24 05:20 gc21-oxide-sled-agent:default.log.2