Deadlock during long-haul stress test

The long-haul stress test running on v1.0.0-rc shows various deadlocks in the warning logs. Looking at the local SQL Server, all the deadlocks seem to have the same cause. Here is an example:

<deadlock>
 <victim-list>
  <victimProcess id="process1f720001088" />
 </victim-list>
 <process-list>
  <process id="process1f720001088" taskpriority="0" logused="0" waitresource="PAGE: 5:1:250251 " waittime="104" ownerId="33084415" transactionname="user_transaction" lasttranstarted="2022-04-19T21:54:03.373" XDES="0x1f7200a8428" lockMode="U" schedulerid="5" kpid="34052" status="suspended" spid="83" sbid="0" ecid="0" priority="0" trancount="2" lastbatchstarted="2022-04-19T21:54:03.373" lastbatchcompleted="2022-04-19T21:54:03.377" lastattention="1900-01-01T00:00:00.377" clientapp="TestHubName" hostname="{MACHINE_NAME}" hostpid="16064" loginname="{username}" isolationlevel="read committed (2)" xactid="33084415" currentdb="5" currentdbname="DurableDB" lockTimeout="4294967295" clientoption1="671088672" clientoption2="128056">
   <executionStack>
    <frame procname="DurableDB.dt._LockNextOrchestration" line="25" stmtstart="1784" stmtend="3092" sqlhandle="0x03000500f01ab84a4ef3a30078ae000001000000000000000000000000000000000000000000000000000000">
UPDATE TOP (1) Instances WITH (READPAST)
    SET
        [LockedBy] = @LockedBy,
        [LockExpiration] = @LockExpiration,
        @instanceID = I.[InstanceID],
        @parentInstanceID = I.[ParentInstanceID],
        @version = I.[Version]
    FROM 
        dt.Instances I WITH (READPAST) INNER JOIN NewEvents E WITH (READPAST) ON
            E.[TaskHub] = @TaskHub AND
            E.[InstanceID] = I.[InstanceID]
    WHERE
        I.TaskHub = @TaskHub AND
        I.[RuntimeStatus] IN ('Pending', 'Running') AND
        (I.[LockExpiration] IS NULL OR I.[LockExpiration] &lt; @now) AND
        (E.[VisibleTime] IS NULL OR E.[VisibleTime] &lt; @now    </frame>
   </executionStack>
   <inputbuf>
Proc [Database Id = 5 Object Id = 1253579504]   </inputbuf>
  </process>
  <process id="process1f713fb4108" taskpriority="0" logused="1392" waitresource="KEY: 5:72057594043826176 (aa52126f61e5)" waittime="104" ownerId="33084446" transactionname="user_transaction" lasttranstarted="2022-04-19T21:54:03.380" XDES="0x1f70df48428" lockMode="X" schedulerid="8" kpid="53204" status="suspended" spid="51" sbid="0" ecid="0" priority="0" trancount="2" lastbatchstarted="2022-04-19T21:54:03.380" lastbatchcompleted="2022-04-19T21:54:03.377" lastattention="1900-01-01T00:00:00.377" clientapp="TestHubName" hostname="{MACHINE_NAME}" hostpid="16064" loginname="{username}" isolationlevel="read committed (2)" xactid="33084446" currentdb="5" currentdbname="DurableDB" lockTimeout="4294967295" clientoption1="671088672" clientoption2="128056">
   <executionStack>
    <frame procname="DurableDB.dt._CheckpointOrchestration" line="219" stmtstart="16706" stmtend="17362" sqlhandle="0x03000500293fac4b51f3a30078ae000001000000000000000000000000000000000000000000000000000000">
DELETE E
    OUTPUT DELETED.InstanceID, DELETED.SequenceNumber
    FROM dt.NewEvents E WITH (FORCESEEK(PK_NewEvents(TaskHub, InstanceID, SequenceNumber)))
        INNER JOIN @DeletedEvents D ON 
            D.InstanceID = E.InstanceID AND
            D.SequenceNumber = E.SequenceNumber AND
            E.TaskHub = @TaskHu    </frame>
   </executionStack>
   <inputbuf>
Proc [Database Id = 5 Object Id = 1269579561]   </inputbuf>
  </process>
 </process-list>
 <resource-list>
  <pagelock fileid="1" pageid="250251" dbid="5" subresource="FULL" objectname="DurableDB.dt.Instances" id="lock1f717da2b00" mode="IX" associatedObjectId="72057594043629568">
   <owner-list>
    <owner id="process1f713fb4108" mode="IX" />
   </owner-list>
   <waiter-list>
    <waiter id="process1f720001088" mode="U" requestType="wait" />
   </waiter-list>
  </pagelock>
  <keylock hobtid="72057594043826176" dbid="5" objectname="DurableDB.dt.NewEvents" indexname="PK_NewEvents" id="lock1f70fd18f00" mode="U" associatedObjectId="72057594043826176">
   <owner-list>
    <owner id="process1f720001088" mode="S" />
   </owner-list>
   <waiter-list>
    <waiter id="process1f713fb4108" mode="X" requestType="convert" />
   </waiter-list>
  </keylock>
 </resource-list>
</deadlock>

Here is the visual:

The deadlocked transactions are automatically retried and the orchestrations continue to run successfully, but this does create delays and also creates noise in the logs. Ideally there should be no deadlocks at all during the long-haul stress test.

microsoft / durabletask-mssql

Deadlock during long-haul stress test #79