Adding a few probes:
@@ -280,13 +283,14 @@ impl<T> Tracking<T> {
/// Record keeping for [`NoneOutstanding`] futures emitted by [`Tracking`]
struct TrackingWait {
+ device_id: u64,
empty: bool,
gen: usize,
wake: Vec<Waker>,
}
impl TrackingWait {
- fn new() -> Self {
- Self { empty: true, gen: 1, wake: Vec::new() }
+ fn new(device_id: u64) -> Self {
+ Self { device_id, empty: true, gen: 1, wake: Vec::new() }
}
fn set_empty(&mut self) {
self.empty = true;
@@ -320,9 +324,14 @@ impl Future for NoneOutstanding {
cx: &mut Context<'_>,
) -> Poll<Self::Output> {
let mut wguard = self.wait.lock().unwrap();
+ probes::none_outstanding_poll!(|| (wguard.device_id));
+
if wguard.empty {
+ probes::none_outstanding_ready!(|| (wguard.device_id));
Poll::Ready(())
} else {
+ probes::none_outstanding_must_wait!(|| (wguard.device_id));
+
// Add us to the waker list if the TrackingWait generation is newer
// than this future. That list is cleaned up by draining its
// entries when waking them (forcing interested futures to re-add
Shows:
device id 1 none outstanding poll
device id 1 none outstanding ready!
device id 2 none outstanding poll
device id 2 none outstanding must wait
device id 3 none outstanding poll
device id 3 none outstanding ready!
The instance spec shows the following for storage devices:
storage_devices: {
"doom-disk": NvmeDisk(
NvmeDisk {
backend_name: "doom-disk",
pci_path: PciPath {
bus: 0,
device: 17,
function: 0,
},
},
),
"doom-install": NvmeDisk(
NvmeDisk {
backend_name: "doom-install",
pci_path: PciPath {
bus: 0,
device: 18,
function: 0,
},
},
),
"cloud-init": VirtioDisk(
VirtioDisk {
backend_name: "cloud-init",
pci_path: PciPath {
bus: 0,
device: 24,
function: 0,
},
},
),
},
Device id 2 probably (?) corresponds to doom-install[^1], the second entry in this list, which is the nvme device for which the pause future isn't returning.
[^1]: I'm naming instances after id software games these days.
Printing the results from the existing block probes:
propolis$target:::block_begin_read
{
printf("device id %d begin read %d\n", args[0], args[1]);
}
propolis$target:::block_begin_write
{
printf("device id %d begin write %d\n", args[0], args[1]);
}
propolis$target:::block_begin_flush
{
printf("device id %d begin flush %d\n", args[0], args[1]);
}
propolis$target:::block_complete_read
{
printf("device id %d complete read %d\n", args[0], args[1]);
}
propolis$target:::block_complete_write
{
printf("device id %d complete write %d\n", args[0], args[1]);
}
propolis$target:::block_complete_flush
{
printf("device id %d complete flush %d\n", args[0], args[1]);
}
and eating that with some Python shows that there are eight writes for device two outstanding:
{
"1": {},
"2": {
"35490": "device id 2 begin write 35490",
"35491": "device id 2 begin write 35491",
"35492": "device id 2 begin write 35492",
"35493": "device id 2 begin write 35493",
"35494": "device id 2 begin write 35494",
"35495": "device id 2 begin write 35495",
"35496": "device id 2 begin write 35496",
"35497": "device id 2 begin write 35497"
},
"3": {}
}
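The Python script itself isn't shown here; a minimal sketch of that kind of pairing (not the original script, and assuming the dtrace output is fed on stdin in the printf formats above) might look like:

#!/usr/bin/env python3
# Sketch: pair "begin" and "complete" probe lines by request id and print,
# per device, the requests that began but never completed.
import json
import re
import sys

# Matches lines like "device id 2 begin write 35490" or
# "device id 2 complete write 35490".
LINE = re.compile(r"device id (\d+) (begin|complete) (read|write|flush) (\d+)")

outstanding = {}  # device id -> {request id -> begin line}

for line in sys.stdin:
    m = LINE.search(line)
    if not m:
        continue
    device, phase, _op, req_id = m.groups()
    per_device = outstanding.setdefault(device, {})
    if phase == "begin":
        per_device[req_id] = line.strip()
    else:
        # A completion without a matching begin just means tracing started
        # after the request was issued; ignore it.
        per_device.pop(req_id, None)

print(json.dumps(outstanding, indent=2))

The same begin/complete (or start/done) pairing applies to the crucible and guest-work probe output further down.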
More probes, this time for the Crucible process_request function:
@@ -291,7 +298,48 @@ impl From<Error> for block::Result {
}
}
+#[usdt::provider(provider = "propolis")]
+mod probes {
+ fn crucible_begin_request(req_id: u64) {}
+ fn crucible_complete_ok_request(req_id: u64) {}
+ fn crucible_complete_err_request(req_id: u64) {}
+}
+
async fn process_request(
+ log: &Logger,
+ block: &(dyn BlockIO + Send + Sync),
+ read_only: bool,
+ skip_flush: bool,
+ req: &block::Request,
+ mem: &MemCtx,
+) -> Result<(), Error> {
+ if let Some(req_id) = req.id() {
+ probes::crucible_begin_request!(|| (req_id));
+ }
+
+ match inner_process_request(block, read_only, skip_flush, req, mem).await {
+ Ok(()) => {
+ if let Some(req_id) = req.id() {
+ probes::crucible_complete_ok_request!(|| (req_id));
+ }
+
+ Ok(())
+ }
+
+ Err(e) => {
+ if let Some(req_id) = req.id() {
+ error!(log, "request {} error {:?}", req_id, e);
+ probes::crucible_complete_err_request!(|| (req_id));
+ } else {
+ error!(log, "request ?? error {:?}", e);
+ }
+
+ Err(e)
+ }
+ }
+}
+
+async fn inner_process_request(
block: &(dyn BlockIO + Send + Sync),
read_only: bool,
skip_flush: bool,
Consuming that output with some more Python shows that it's the Crucible requests that are not completing:
device:
{
"2": {},
"3": {
"305617": "device id 3 begin write 305617",
"305618": "device id 3 begin write 305618",
"305619": "device id 3 begin write 305619",
"305620": "device id 3 begin write 305620",
"305621": "device id 3 begin write 305621",
"305622": "device id 3 begin write 305622",
"305623": "device id 3 begin write 305623",
"305624": "device id 3 begin write 305624"
},
"1": {}
}
crucible:
{
"305617": "crucible begin request 305617",
"305618": "crucible begin request 305618",
"305619": "crucible begin request 305619",
"305620": "crucible begin request 305620",
"305621": "crucible begin request 305621",
"305622": "crucible begin request 305622",
"305623": "crucible begin request 305623",
"305624": "crucible begin request 305624"
}
Also interesting to note that it's device id 3 this time.
If we hand off IO requests to crucible, and never get completions for them, I would expect the nvme device (or really any block emulation) to act this way.
I do plan to add the ability for the device emulation to request cancellation of pending requests, but it will be incumbent on the backend to pay heed to such requests, which is a potentially complicated endeavor. (Also, any backend which is holding "stalled" requests is probably unlikely to make forward progress on them, even if they were marked as cancelled.)
I think there's something more subtle going on. If I add print statements for Crucible's guest work probes:
crucible_upstairs$target:::gw-read-start
{
printf("gw_id %d read start\n", args[0]);
}
crucible_upstairs$target:::gw-read-done
{
printf("gw_id %d read done\n", args[0]);
}
crucible_upstairs$target:::gw-write-start
{
printf("gw_id %d write start\n", args[0]);
}
crucible_upstairs$target:::gw-write-done
{
printf("gw_id %d write done\n", args[0]);
}
crucible_upstairs$target:::gw-flush-start
{
printf("gw_id %d flush start\n", args[0]);
}
crucible_upstairs$target:::gw-flush-done
{
printf("gw_id %d flush done\n", args[0]);
}
And feeding that into the Python script, I see:
device:
{
"3": {},
"2": {
"43221": "device id 2 begin write 43221",
"43222": "device id 2 begin write 43222",
"43223": "device id 2 begin write 43223",
"43224": "device id 2 begin write 43224",
"43225": "device id 2 begin write 43225",
"43226": "device id 2 begin write 43226",
"43227": "device id 2 begin write 43227",
"43228": "device id 2 begin write 43228"
},
"1": {}
}
crucible:
{
"43221": "crucible begin request 43221",
"43222": "crucible begin request 43222",
"43223": "crucible begin request 43223",
"43224": "crucible begin request 43224",
"43225": "crucible begin request 43225",
"43226": "crucible begin request 43226",
"43227": "crucible begin request 43227",
"43228": "crucible begin request 43228"
}
guest_work:
{
"22888": "gw_id 22888 flush start",
"30731": "gw_id 30731 flush start",
"37505": "gw_id 37505 flush start",
"42794": "gw_id 42794 flush start"
}
From Crucible's perspective it looks like all the writes completed, while four flushes did not. From propolis' perspective, it started 8 writes but didn't receive completions for them. I'm going to keep digging.
This feels like a cancel-safety issue: some Crucible future is being dropped and causing trouble. I tried spawning a task in Crucible's process_loop, but that didn't seem to fix it:
@@ -24,16 +24,21 @@ pub use nexus_client::Client as NexusClient;
pub struct CrucibleBackend {
state: Arc<WorkerState>,
}
+
struct WorkerState {
attachment: block::backend::Attachment,
volume: Volume,
info: block::DeviceInfo,
skip_flush: bool,
+ log: Logger,
}
+
impl WorkerState {
async fn process_loop(&self, acc_mem: MemAccessor) {
let read_only = self.info.read_only;
let skip_flush = self.skip_flush;
+ let acc_mem = Arc::new(acc_mem);
+
loop {
let req = match self.attachment.wait_for_req().await {
Some(r) => r,
@@ -42,13 +47,22 @@ impl WorkerState {
break;
}
};
- let res = if let Some(memctx) = acc_mem.access() {
- match process_request(
- &self.volume,
+
+ // Spawn a task that will handle this request. crucible is not
+ // cancel-safe, so we must proceed until completion even if the
+ // process_loop task is dropped.
+ let log = self.log.clone();
+ let volume = self.volume.clone();
+ let acc_mem = acc_mem.clone();
+
+ let _ = tokio::spawn(async move {
+ let res = match process_request(
+ &log,
+ &volume,
read_only,
skip_flush,
&req,
- &memctx,
+ acc_mem,
)
.await
{
@@ -58,11 +72,10 @@ impl WorkerState {
assert!(mapped.is_err());
mapped
}
- }
- } else {
- block::Result::Failure
- };
- req.complete(res);
+ };
+
+ req.complete(res);
+ }).await;
}
}
}
WorkerState::process_loop is itself already run in a separate spawned task, so that would explain the same behaviour with your change:
https://github.com/oxidecomputer/propolis/blob/26806e8d48fccd4a1d5f20f1f8bd77d3aa23b0db/lib/propolis/src/block/crucible.rs#L235-L237
From Crucible's perspective it looks like all the writes completed, while four flushes did not. From propolis' perspective, it started 8 writes but didn't receive completions for them. I'm going to keep digging.
Is it the crucible downstairs or the upstairs that you mean completed the writes?
Is it the crucible downstairs or the upstairs that you mean completed the writes?
"Guest work" is tracked Upstairs, and tracks all the downstairs work. When all that downstairs work is done, the guest work is done. See https://github.com/oxidecomputer/crucible/blob/2e02507fb1625ee392ca9a0063a4a62793597648/upstairs/src/lib.rs#L9003
It just occurred to me that I was testing Matt's one-task-to-rule branch, and not main... I'm respinning with main now to see if this reproduces there too.
It just occurred to me that I was testing Matt's one-task-to-rule branch, and not main... I'm respinning with main now to see if this reproduces there too.
I think some form of it probably does, as this looks similar to https://github.com/oxidecomputer/customer-support/issues/47.
Unfortunately it hasn't reproduced yet.
It took running five fio processes with the same workload with 10x the io depth, but I managed to reproduce Linux kicking the disk out on main. Problem is that sudo poweroff halted everything OK though.
Here's the fio workload for posterity:
[global]
filename=/dev/nvme1n1
iodepth=25
ioengine=aio
time_based
runtime=60
numjobs=1
direct=1
stonewall=1
[randwrite-4K]
bs=4K
rw=randwrite
[randwrite-1M]
bs=1M
rw=randwrite
[randwrite-4M]
bs=4M
rw=randwrite
Pointing the same Python at the dtrace output for the main reproduction shows:
device:
{
"2": {},
"3": {},
"1": {}
}
crucible:
{}
guest_work:
{}
indicating every request was honoured.
Hmm, so something about the one-task-to-rule branch is causing IO requests to hang (in a more reproducible manner, at the very least). In either case, we definitely need to be able to handle shutting down with hung requests, but like Patrick said, that's a more complicated endeavor.
I believe that the "easier IO hangs" are due to a counter-intuitive interaction between backpressure and max latency. As of https://github.com/oxidecomputer/crucible/commit/41f361ac725c0e7bcb3b9ce28238741d12deb620, I successfully ran 5x simultaneous fio tests (with iodepth=250) without the disk being kicked out.
I'm going to close this; I think it's covered by discovering that the mpsc channel was deadlocked.
EDIT: I commented before seeing Matt's message! Defer to that instead.
If Crucible takes too long to process some IO, Linux resets the NVMe controller, and eventually disables the controller and kicks out the disk. I then tried to sudo poweroff from within the instance, and it's stuck waiting for the pci-nvme-0.18.0 pause future to complete. This instance is stuck in the Stopping state.