Open leftwo opened 2 weeks ago
In dsc/src/main.rs, we have the following code that starts a downstairs:
fn start(&self) -> Result<Child> {
println!("Make output file at {:?}", self.output_file);
let outputs = File::create(&self.output_file)
.context("Failed to create output file")?;
let errors = outputs.try_clone()?;
let port_value = format!("{}", self.port);
let mode = if self.read_only {
"ro".to_string()
} else {
"rw".to_string()
};
let region_dir = self.region_dir.clone();
let cmd = Command::new(self.ds_bin.clone())
.args([
"run",
"-p",
&port_value,
"-d",
&region_dir,
"--mode",
&mode,
])
.stdout(Stdio::from(outputs))
.stderr(Stdio::from(errors))
.spawn()
.context("Failed trying to run downstairs")?;
println!(
"Downstairs {} port {} PID:{:?}",
region_dir,
self.port,
cmd.id()
);
In our logs above, we see the first message, `Make output file at...`, for this client, but we never see the second, `Downstairs {} port {}...`. So that command (which does spawn; we see the output file) never comes back from that `spawn()` call, as near as I can determine.
I was able to reproduce it outside of CI using a debug build.
The stacks look mostly similar, but this one stuck out:
fffff9ffeeb7bf8a read (20, fffff9ffec3fbc00, 8)
0000000003add616 std::sys::pal::unix::process::process_inner::<impl std::sys::pal::unix::process::process_common::Command>::spawn::h356b47fec30ee6a3 () + 356
0000000003ace1ec std::process::Command::spawn::hb5ff4e4d3aae5893 () + 1c
00000000036eb319 tokio::process::imp::spawn_child::h2e091d8168139f18 () + 29
00000000036c3ea0 tokio::process::Command::spawn::h82818d972f5bd22b () + 30
0000000001ebd1ff dsc::DownstairsInfo::start::hdb9e8fbe2e7641c0 () + 4af
0000000001fc382a dsc::start_ds::{{closure}}::h72fb5d778549a741 () + 39a
0000000001fc3d27 dsc::ds_start_monitor::{{closure}}::h79d20fed6577f95e () + 1a7
0000000001fc340b dsc::start_dsc::{{closure}}::{{closure}}::h4710d443c5725ecc () + db
0000000001ee5cde tokio::runtime::task::core::Core<T,S>::poll::{{closure}}::h25306ec96d735a5a () + 6e
0000000001ee4d6c tokio::runtime::task::core::Core<T,S>::poll::h323db97a5faf7906 () + 2c
0000000001f23dcf tokio::runtime::task::harness::poll_future::{{closure}}::h4932b9dce84b5721 () + 3f
0000000001e8f172 <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::hcffcc65ef2a830a9 () + 22
0000000001f99be6 std::panicking::try::do_call::h791a875498f9620a () + 36
0000000001f94064 std::panicking::try::h1ebe6082f4923d75 () + 44
0000000001e901e4 std::panic::catch_unwind::h280a3ab6527a6078 () + 14
0000000001f22563 tokio::runtime::task::harness::poll_future::hcb4b3af1b5128951 () + 73
0000000001f251d6 tokio::runtime::task::harness::Harness<T,S>::poll_inner::h1765a596355314e0 () + 86
0000000001f2b3e5 tokio::runtime::task::harness::Harness<T,S>::poll::h9cb104db5087b249 () + 15
0000000001ecbc2d tokio::runtime::task::raw::poll::hafbc00b5629743bc () + 1d
0000000003735146 tokio::runtime::task::raw::RawTask::poll::hbcd9b90a9f68e73f () + 26
0000000003742072 tokio::runtime::task::LocalNotified<S>::run::h82c13fc69d40bfa2 () + 22
00000000036c898c tokio::runtime::scheduler::multi_thread::worker::Context::run_task::{{closure}}::h9ecf202fb4aedd36 () + 1c
00000000036c8950 tokio::runtime::scheduler::multi_thread::worker::Context::run_task::h9f2957183dfc0a17 () + 120
00000000036c8634 tokio::runtime::scheduler::multi_thread::worker::Context::run::hb79d7578a4d720cc () + 1b4
00000000036c8424 tokio::runtime::scheduler::multi_thread::worker::run::{{closure}}::{{closure}}::h947c9cce25841432 () + 34
000000000370ff9f tokio::runtime::context::scoped::Scoped<T>::set::h589875c319f11e61 () + 5f
00000000036c547b tokio::runtime::context::set_scheduler::{{closure}}::h946422b065b2537b () + 2b
00000000036cd4a7 std::thread::local::LocalKey<T>::try_with::hbff304b678572a27 () + b7
00000000036cbaa1 std::thread::local::LocalKey<T>::with::h7e497ece18cd4db2 () + 11
00000000036c5440 tokio::runtime::context::set_scheduler::h92d580ae830637b9 () + 30
00000000036c83d8 tokio::runtime::scheduler::multi_thread::worker::run::{{closure}}::h23bf5525e61184da () + c8
000000000375a83e tokio::runtime::context::runtime::enter_runtime::h1093cfccbd6ee326 () + ae
00000000036c829b tokio::runtime::scheduler::multi_thread::worker::run::h6416b5163edbc5b8 () + 11b
00000000036c8171 tokio::runtime::scheduler::multi_thread::worker::Launch::launch::{{closure}}::hc6817b4d669b3e32 () + 11
0000000003707056 <tokio::runtime::blocking::task::BlockingTask<T> as core::future::future::Future>::poll::h234967f3a9f321ed () + 56
00000000036e37a5 tokio::runtime::task::core::Core<T,S>::poll::{{closure}}::h596f8a7421257a1b () + 75
00000000036e2afc tokio::runtime::task::core::Core<T,S>::poll::h7990afe31ca5173b () + 2c
00000000036b649f tokio::runtime::task::harness::poll_future::{{closure}}::h2b16ec35846ce065 () + 3f
0000000003746182 <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::hc3c3bbccb1345b38 () + 22
0000000003767a06 std::panicking::try::do_call::h80b93ac65f9ea34a () + 36
0000000003763c04 std::panicking::try::h62a93063541a4e4f () + 44
0000000003751c24 std::panic::catch_unwind::hf4eeead54b5fbc66 () + 14
00000000036b33b3 tokio::runtime::task::harness::poll_future::h33ea2f16a04c8f82 () + 73
00000000036a9906 tokio::runtime::task::harness::Harness<T,S>::poll_inner::h2151a92fefbb26a3 () + 86
00000000036a8dd5 tokio::runtime::task::harness::Harness<T,S>::poll::h6fa656e3a74f94bf () + 15
000000000373563d tokio::runtime::task::raw::poll::h6c0e9ed682716b6b () + 1d
0000000003735146 tokio::runtime::task::raw::RawTask::poll::hbcd9b90a9f68e73f () + 26
0000000003742106 tokio::runtime::task::UnownedTask<S>::run::hd214edb526cc3a17 () + 26
0000000003702109 tokio::runtime::blocking::pool::Task::run::ha15a3499ed14041a () + 19
000000000370643c tokio::runtime::blocking::pool::Inner::run::hc0d1077484ccb8f8 () + 11c
00000000037062c9 tokio::runtime::blocking::pool::Spawner::spawn_thread::{{closure}}::h9b828a85de1189c0 () + 39
00000000037376ed std::sys_common::backtrace::__rust_begin_short_backtrace::hcde0af8ac6e94548 () + d
0000000003714d50 std::thread::Builder::spawn_unchecked_::{{closure}}::{{closure}}::h682a7ee890499fa7 () + 30
0000000003745470 <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::h1db3259947d8925b () + 30
0000000003767bc0 std::panicking::try::do_call::h95bd2eb6539cb660 () + 40
0000000003762b0b std::panicking::try::h1ff37b4fa23118c9 () + 5b
0000000003714be8 std::thread::Builder::spawn_unchecked_::{{closure}}::h13ddbdd7ae4a1097 () + 1f8
00000000036b7c71 core::ops::function::FnOnce::call_once{{vtable.shim}}::hb6cc2aa0206b9ae1 () + 11
0000000003adce5b std::sys::pal::unix::thread::Thread::new::thread_start::hc6a6dcaabc211891 () + 1b
fffff9ffeeb74f37 _thrp_setup (fffff9ffeec93a40) + 77
fffff9ffeeb75280 _lwp_start ()
I believe this was mistakenly closed because you included the phrase `fix: #1498` in #1504!
In CI (link may be bad) job: https://buildomat.eng.oxide.computer/wg/0/artefact/01J9KZDBT7Q76BMAZ3NFT2EM6G/JHuMk44VV03fuovquSYDTdL9uCEruARhbYRD7WhRUTif7Lam/01J9KZDV6QXGVBQG0ZHA6JNXTH/01J9M3TRMJHBD7KTGG9QWWK022/dsc-out.txt
The test timed out. Looking at the test log output, we can see that a new check we added is waiting for all the downstairs to respond:
That check keeps waiting until the test hits a timeout and we abort it. In the logs for `dsc` we can see it got to `Starting` on client 0, but never to `Running`:
A third bit of information: if I look at the output for client 0, port 8810, I do see that it started:
This suggests the problem is somewhere in the messages between different parts of `dsc`.