instance ticket teardown seems racy & might be causing unexpected concurrent operations on Propolis zones

I'm continuing to peel the onion on sled agent failures caused by instance API stress and have now gotten to a class of issues like the following:

23:31:42.747Z WARN SledAgent (InstanceManager): Halting and removing zone: oxz_propolis-server_d58859db-b632-4d1a-9479-1a44728a0bf9                                                                                                                                                                                          
    instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       
23:31:43.255Z INFO SledAgent (InstanceManager): ensuring instance is registered                                                                                                                                                                                                                                              
    instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       
    propolis_id = d58859db-b632-4d1a-9479-1a44728a0bf9                                                                                                                                                                                                                                                                       
23:31:46.012Z INFO SledAgent (InstanceManager): halt_and_remove_logged: Previous zone state: Running                                                                                                                                                                                                                         
    instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       
23:31:46.031Z INFO SledAgent (InstanceManager): State monitoring task complete                                                                                                                                                                                                                                               
    instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       
23:31:46.031Z INFO SledAgent (InstanceManager): Publishing instance state update to Nexus                                                                                                                                                                                                                                    
    instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       
    state = InstanceRuntimeState { run_state: Starting, sled_id: bb07afed-b435-4260-97b3-0e4e32b499b6, propolis_id: d58859db-b632-4d1a-9479-1a44728a0bf9, dst_propolis_id: None, propolis_addr: Some([fd00:1122:3344:101::4e]:12400), migration_id: None, propolis_gen: Generation(2), ncpus: InstanceCpuCount(1), memory: By
teCount(1073741824), hostname: "inst1", gen: Generation(8), time_updated: 2023-06-08T23:31:46.031673699Z }                                                                                                                                                                                                                   
23:31:46.089Z INFO SledAgent (InstanceManager): Configuring new Omicron zone: oxz_propolis-server_d58859db-b632-4d1a-9479-1a44728a0bf9                                                                                                                                                                                       
    instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       
23:31:46.110Z INFO SledAgent (InstanceManager): Installing Omicron zone: oxz_propolis-server_d58859db-b632-4d1a-9479-1a44728a0bf9                                                                                                                                                                                            
    instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       
23:31:46.869Z INFO SledAgent (InstanceManager): ensuring instance is registered                                                                                                                                                                                                                                                  instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                           propolis_id = d58859db-b632-4d1a-9479-1a44728a0bf9                                                                                                                                                                                                                                                                       23:31:46.869Z INFO SledAgent (InstanceManager): registering new instance                                                                                                                                                                                                                                                         instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       23:31:46.903Z INFO SledAgent (InstanceManager): Publishing instance state update to Nexus                                                                                                                                                                                                                                        instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                           state = InstanceRuntimeState { run_state: Starting, sled_id: bb07afed-b435-4260-97b3-0e4e32b499b6, propolis_id: d58859db-b632-4d1a-9479-1a44728a0bf9, dst_propolis_id: None, propolis_addr: Some([fd00:1122:3344:101::4e]:12400), migration_id: None, propolis_gen: Generation(2), ncpus: InstanceCpuCount(1), memory: ByteCount(1073741824), hostname: "inst1", gen: Generation(9), time_updated: 2023-06-08T23:31:46.903342690Z }                                                                                                                                                                                                                   23:31:46.960Z INFO SledAgent (InstanceManager): install_omicron_zone: Found zone: oxz_propolis-server_d58859db-b632-4d1a-9479-1a44728a0bf9 in state Incomplete                                                                                                                                                                   instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       23:31:46.960Z INFO SledAgent (InstanceManager): Invalid state; uninstalling and deleting zone oxz_propolis-server_d58859db-b632-4d1a-9479-1a44728a0bf9                                                                                                                                                                           instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                       23:31:49.662Z INFO SledAgent (InstanceManager): Adding service                                                                                                                                                                                                                                                                   instance_id = 058db778-8513-40ea-af9f-6309f6af8826                                                                                                                                                                                                                                                                           smf_name = svc:/system/illumos/propolis-server:vm-f7b82f0f-4356-4431-aa06-1742eb5c90fb                                                                                                                                                                                                                                   23:31:49.682Z INFO SledAgent (InstanceManager): Adding service property group 'config'                                                                                                                                                                                                                                           instance_id = 058db778-8513-40ea-af9f-6309f6af8826                                                                                                                                                                                                                                                                       23:31:50.949Z INFO SledAgent (InstanceManager): Zone booting                                                                                                                                                                                                                                                                     instance_id = f8a059c0-f6f9-499d-b452-8358c4d72650                                                                                                                                                                                                                                                                           zone = oxz_propolis-server_d58859db-b632-4d1a-9479-1a44728a0bf9                                                                                                                                                                                                                                                          23:31:50.962Z ERRO SledAgent (InstanceManager): instance setup failed: Err(ZoneBoot(Booting(AdmError { op: Boot, zone: "oxz_propolis-server_d58859db-b632-4d1a-9479-1a44728a0bf9", err: CommandOutput(CommandOutputError("exit code 1\nstdout:\n\nstderr:\nzoneadm: zone 'oxz_propolis-server_d58859db-b632-4d1a-9479-1a44728a0bf9': must be installed before boot.")) })))

In the best case, this leaves users with Failed instances. In the worst case, sled agent might unwrap a zoneadm command it expects to succeed (see e.g. #3264) and knock out an entire sled agent.

ILTM like there are a couple of races in sled agent that can lead to this sort of problem. One of them is as follows:

An instance starts and then stops.
Thread T1: The instance's Propolis VMM shuts down; state monitor calls Instance::terminate, which takes the instance lock and then calls InstanceInner::terminate.
T2: In parallel a call to ensure the instance is running arrives. This call takes the instance manager lock, finds that the instance is registered on the target sled (in the guise of the Instance that's being torn down), clones a reference to that instance, drops the instance manager lock, and then waits to take the instance lock.
T1: InstanceInner::terminate calls instance_ticket.terminate(), which removes the instance from the instance manager.
T3: A second parallel call to ensure the instance is running also arrives. This does not find the instance in the instance manager and so creates a brand new generation of it.
T1 finishes cleaning up the unused instance, drops the instance lock, and exits.
T2 enters Instance::put_state and takes the instance lock for its generation of the instance. It then calls propolis_ensure, sees that there is no running_zone (it was consumed by T1 in the previous step), and so starts trying to install a new Propolis zone.
T3 does the same thing for its generation of the instance. Note that this is not excluded by the lock that T2 holds because they have distinct Instance structures!
One of T2 or T3 gets upset because the zone already exists/is in the wrong state.
Hilarity ensues.

I've been mulling this over some and have some thoughts. (Fair warning: synchronization-related rambling ahead!)

ISTM the fundamental problem here is that it's possible to remove an instance from the instance manager's table while it still has active references. (This is done through the InstanceTicket::terminate routine.) Instead we could move toward a scheme like this (N.B. some of the details here may need to be massaged a bit):

The instance manager only has a weak reference to each individual Instance
If an instance has a running Propolis, the associated runtime state (i.e. the instance::RunningState) holds a strong reference to the Instance; this is roughly tied to the lifetime of the monitor task for the instance, which runs until the instance is destroyed and then exits
When an Instance is dropped, it removes itself from the instance table (we want the weak ref to go away so the underlying memory can be freed even if no one refers to the instance again)

This helps to preserve the invariant we want: if someone has an active strong reference to an instance, then the instance should be present in the instance manager.

But there's a catch: to fix #2209, I changed the code that incarnates an instance on a sled so that it operates in two phases: first Nexus calls to register the instance on the sled (i.e. create an instance manager entry for it), and then it calls a different API to ensure there's a running Propolis for that instance. This works today because the instance manager holds a strong reference to the Instance that persists after the registration call returns. If we change instance manager references to weak refs, the Instance will get dropped as soon as the registration call comes back, which will cause the later "start" step to fail. We can't simply re-register the instance in response to a "start" call, either, because that will just reproduce #2209.

It might be that we need to take a somewhat different approach to starting instances to avoid all of the possible problems here. I have an idea for what that could look like, but haven't fleshed it all out yet and so will save it for a subsequent comment.

I have an idea for what that could look like, but haven't fleshed it all out yet and so will save it for a subsequent comment.

The general idea here is to change instance start so that launching an instance first requires the relevant saga to obtain a "start token" that holds a strong reference to the Instance that the token's owner intends to start. The token is identified by a UUID that the saga generates and then sends to sled agent (to ensure that the saga can refer to the same token even if some of its steps are retried). A token can be redeemed to start an instance only once; subsequent attempts to redeem a previously-redeemed token succeed for idempotency purposes (after which the saga can explicitly release them).

This requires instance start to be a saga (#2824), because that's the only Nexus context in which it's safe to create tokens without fear of leaking them.

This should also work for live migration (sled agent already treats migration as a special case of instance start that has some extra parameters).

I wrote up TLA+ specs that model the existing instance ticket behavior (in theory, anyway) and the scheme I'm proposing here. The existing behavior quickly runs afoul of the "only one active generation of an instance at once" invariant I set for it. The proposed behavior passes model checking with 4 concurrent start attempts and a concurrent process that stops the instance if it's started (to release the RunningState strong ref described above).

Here's the extremely rough (but hopefully ready!) PlusCal spec for the start-token design. The definitions from GenInTableInv on down are invariants/properties the model upholds (checked for 4 sagas):

---- MODULE instance_table_2 ----

EXTENDS TLC, Integers, FiniteSets, Sequences

CONSTANT NULL, SAGAS

(* --algorithm instance_table_2 {

variables
    \* The instance generation that's currently present in the table.
    gen_in_table = NULL;

    \* The current owner of the table lock, or NULL if the lock isn't held.
    lock_owner = NULL;

    \* The next instance generation to assign. This is a proxy for having
    \* different Instance structures on the sled agent's heap.
    gen = 0;

    \* Strong and weak reference counts for each Instance structure.
    strong_refs = <<>>;
    weak_refs = <<>>;

    \* The set of "start tokens" known to the sled agent. Tokens are tuples
    \* from the instance-start saga ID to a specific instance generation. They
    \* serve to keep an instance alive and in the table while the saga is
    \* between the "register" and "ensure" steps.
    start_tokens = {};

    \* The instance generations for which an instance is active. An active
    \* instance holds a strong reference to itself until it becomes inactive.
    active_gens = {};

define {
    \* Utility operators for working with start tokens.
    StartTokensForSaga(s) == {t \in start_tokens: t[1] = s}
    StartToken(s) == CHOOSE t \in start_tokens: t[1] = s
    StartTokenGen(s) == StartToken(s)[2]

    \* Permutation of sagas used to define symmetry sets for model checking.
    SagaPermutations == Permutations(SAGAS)

    \* Instance generations must have been seen before to be put in the table.
    GenInTableInv == (gen_in_table = NULL) \/ (gen_in_table < gen)

    \* Ensure a single saga doesn't have start tokens pointing at multiple
    \* instance structures.
    OneStartTokenPerSaga ==
        \A first, second \in start_tokens:
            first[1] = second[1] => first[2] = second[2]

     \* The lock can only go to or from the NULL state.
    LockAcquiredOnlyWhenUnlocked == [][lock_owner' # NULL => lock_owner = NULL]_lock_owner
    LockReleasedOnlyWhenLocked == [][lock_owner' = NULL => lock_owner # NULL]_lock_owner

    \* Reference counts must always be non-negative.
    StrongRefsNotNegative == \A i \in DOMAIN strong_refs: strong_refs[i] >= 0
    WeakRefsNotNegative == \A i \in DOMAIN weak_refs: weak_refs[i] >= 0

    \* An instance can't go from having no strong references to some strong
    \* references.
    DeletedItemsRemainDeleted ==
        [][\A i \in DOMAIN strong_refs: strong_refs[i] = 0 => strong_refs'[i] = 0]_strong_refs

    \* Only one generation should have a strong reference active at once. If
    \* this is not true, there are multiple threads with references to conflicting
    \* generations.
    OnlyOneAlive ==
        \A i, j \in DOMAIN strong_refs:
            (strong_refs[i] > 0 /\ strong_refs[j] > 0) => i = j

    \* A variant of the previous invariant: if an instance has a strong reference,
    \* then it must be pointed to by the instance table. Note that the converse
    \* is not true: the table itself holds only a weak reference, so it's permitted
    \* to have an instance in the table but have no strong references to it.
    StrongRefImpliesTable == \A i \in DOMAIN strong_refs:
        strong_refs[i] > 0 => gen_in_table = i

    \* If the table has an instance in it, it must hold at least some kind of
    \* reference to the instance.
    TableHoldsSomeReference ==
        gen_in_table # NULL =>
            (strong_refs[gen_in_table] > 0 \/ weak_refs[gen_in_table] > 0)

    \* All start tokens should be released by the time all processes have exited.
    AllSagaTokensReleased == (\A self \in DOMAIN pc: pc[self] = "Done") =>
        start_tokens = {}
}

macro acquire_lock() {
    assert(lock_owner # self);
    await lock_owner = NULL;
    lock_owner := self;
}

macro release_lock() {
    assert(lock_owner = self);
    lock_owner := NULL;
}

macro add_start_token(g) {
    assert(<<self, g>> \notin start_tokens);
    start_tokens := start_tokens \union {<<self, g>>};
}

procedure drop_ref_outside_lock(to_drop)
variables old_refcount
{
    deref_1:
        old_refcount := strong_refs[to_drop];
        strong_refs[to_drop] := strong_refs[to_drop] - 1;
    deref_2:
        if (old_refcount = 1) {
            acquire_lock();
            deref_3:
                \* We have to compare explicitly to see if the instance in
                \* the table is the one we're dropping. Otherwise, we can
                \* hit a race where some other thread failed to upgrade the
                \* weak ref in the table and replaced it with a new instance,
                \* and then we yank out the new instance. In Rust this would
                \* involve a `ptr_eq` on the relevant Arc/Weak.
                if (to_drop = gen_in_table) {
                    gen_in_table := NULL;
                };
                weak_refs[to_drop] := weak_refs[to_drop] - 1;
            deref_4: release_lock();
        } else {
            skip;
        };
    deref_ret:
        return;
}

fair process (StartSaga \in SAGAS)
variable to_deref = NULL;
{
    ss_register_1: acquire_lock();

    \* If the instance isn't registered on the sled, or it's in the instance
    \* table but has no strong references (because the last strong ref has
    \* been released but the instance is not yet dropped), create a new
    \* generation of the instance and a start token for this saga.
    \*
    \* In the Rust implementation, the start tokens are in their own table
    \* mapping a `Uuid` (generated in the saga) to an `Arc<Instance>`. Here
    \* we substitute the saga process ID for the UUID.
    \*
    \* If the instance is registered on the sled and there's no start token
    \* for this specific saga, create a token for this saga.
    \*
    \* Otherwise do nothing for idempotency.
    ss_register_2:
        if (gen_in_table = NULL \/ strong_refs[gen_in_table] = 0) {
            assert(gen \notin DOMAIN strong_refs);
            assert(gen \notin DOMAIN weak_refs);

            gen_in_table := gen;
            strong_refs := strong_refs @@ gen :> 1;
            weak_refs := weak_refs @@ gen :> 1;
            add_start_token(gen);
            gen := gen + 1;
        } else if (StartTokensForSaga(self) = {}) {
            add_start_token(gen_in_table);
            strong_refs[gen_in_table] := strong_refs[gen_in_table] + 1;
        } else {
            skip;
        };
    ss_register_3: release_lock();
    \* Allow this saga node to be retried.
    ss_register_4: either { goto ss_register_1; } or { skip; };

    \* If there is a start token for this saga attempt, redeem it.
    ss_start_1:
        if (StartTokensForSaga(self) # {}) {
            if (StartTokenGen(self) \notin active_gens) {
                active_gens := active_gens \union {StartTokenGen(self)};
                strong_refs[StartTokenGen(self)] :=
                    strong_refs[StartTokenGen(self)] + 1;
            };

            start_tokens := start_tokens \ {StartToken(self)};
            to_deref := StartTokenGen(self);
        } else {
            to_deref := NULL;
        };
    ss_start_2:
        if (to_deref # NULL) {
            call drop_ref_outside_lock(to_deref);
        };
    \* Allow this saga node to be retried.
    ss_start_3: either { goto ss_start_1; } or { skip; };
}

\* Once an instance is registered in the table, there are various forces that
\* can cause it to be unregistered--an explicit unregistration call, the death
\* of the internal VM, etc. Emulate their effects here.
fair process (Terminator \in {"Terminator"})
variables to_terminate;
{
    t_await_active: either {
        await active_gens # {};
        t_terminate: with (target \in active_gens) {
            to_terminate := target;
            active_gens := active_gens \ {target};
        };
        t_drop_ref:
            call drop_ref_outside_lock(to_terminate);
            goto t_await_active;
    } or {
        skip;
    }
}

} end algorithm; *)

oxidecomputer / omicron

instance ticket teardown seems racy & might be causing unexpected concurrent operations on Propolis zones #3325