microsoft / durabletask-mssql

Microsoft SQL storage provider for Durable Functions and the Durable Task Framework
MIT License
87 stars 32 forks source link

Deadlock during long-haul stress test #79

Open cgillum opened 2 years ago

cgillum commented 2 years ago

The long-haul stress test running on v1.0.0-rc shows various deadlocks in the warning logs. Looking at the local SQL Server, all the deadlocks seem to have the same cause. Here is an example:

<deadlock>
 <victim-list>
  <victimProcess id="process1f720001088" />
 </victim-list>
 <process-list>
  <process id="process1f720001088" taskpriority="0" logused="0" waitresource="PAGE: 5:1:250251 " waittime="104" ownerId="33084415" transactionname="user_transaction" lasttranstarted="2022-04-19T21:54:03.373" XDES="0x1f7200a8428" lockMode="U" schedulerid="5" kpid="34052" status="suspended" spid="83" sbid="0" ecid="0" priority="0" trancount="2" lastbatchstarted="2022-04-19T21:54:03.373" lastbatchcompleted="2022-04-19T21:54:03.377" lastattention="1900-01-01T00:00:00.377" clientapp="TestHubName" hostname="{MACHINE_NAME}" hostpid="16064" loginname="{username}" isolationlevel="read committed (2)" xactid="33084415" currentdb="5" currentdbname="DurableDB" lockTimeout="4294967295" clientoption1="671088672" clientoption2="128056">
   <executionStack>
    <frame procname="DurableDB.dt._LockNextOrchestration" line="25" stmtstart="1784" stmtend="3092" sqlhandle="0x03000500f01ab84a4ef3a30078ae000001000000000000000000000000000000000000000000000000000000">
UPDATE TOP (1) Instances WITH (READPAST)
    SET
        [LockedBy] = @LockedBy,
        [LockExpiration] = @LockExpiration,
        @instanceID = I.[InstanceID],
        @parentInstanceID = I.[ParentInstanceID],
        @version = I.[Version]
    FROM 
        dt.Instances I WITH (READPAST) INNER JOIN NewEvents E WITH (READPAST) ON
            E.[TaskHub] = @TaskHub AND
            E.[InstanceID] = I.[InstanceID]
    WHERE
        I.TaskHub = @TaskHub AND
        I.[RuntimeStatus] IN ('Pending', 'Running') AND
        (I.[LockExpiration] IS NULL OR I.[LockExpiration] &lt; @now) AND
        (E.[VisibleTime] IS NULL OR E.[VisibleTime] &lt; @now    </frame>
   </executionStack>
   <inputbuf>
Proc [Database Id = 5 Object Id = 1253579504]   </inputbuf>
  </process>
  <process id="process1f713fb4108" taskpriority="0" logused="1392" waitresource="KEY: 5:72057594043826176 (aa52126f61e5)" waittime="104" ownerId="33084446" transactionname="user_transaction" lasttranstarted="2022-04-19T21:54:03.380" XDES="0x1f70df48428" lockMode="X" schedulerid="8" kpid="53204" status="suspended" spid="51" sbid="0" ecid="0" priority="0" trancount="2" lastbatchstarted="2022-04-19T21:54:03.380" lastbatchcompleted="2022-04-19T21:54:03.377" lastattention="1900-01-01T00:00:00.377" clientapp="TestHubName" hostname="{MACHINE_NAME}" hostpid="16064" loginname="{username}" isolationlevel="read committed (2)" xactid="33084446" currentdb="5" currentdbname="DurableDB" lockTimeout="4294967295" clientoption1="671088672" clientoption2="128056">
   <executionStack>
    <frame procname="DurableDB.dt._CheckpointOrchestration" line="219" stmtstart="16706" stmtend="17362" sqlhandle="0x03000500293fac4b51f3a30078ae000001000000000000000000000000000000000000000000000000000000">
DELETE E
    OUTPUT DELETED.InstanceID, DELETED.SequenceNumber
    FROM dt.NewEvents E WITH (FORCESEEK(PK_NewEvents(TaskHub, InstanceID, SequenceNumber)))
        INNER JOIN @DeletedEvents D ON 
            D.InstanceID = E.InstanceID AND
            D.SequenceNumber = E.SequenceNumber AND
            E.TaskHub = @TaskHu    </frame>
   </executionStack>
   <inputbuf>
Proc [Database Id = 5 Object Id = 1269579561]   </inputbuf>
  </process>
 </process-list>
 <resource-list>
  <pagelock fileid="1" pageid="250251" dbid="5" subresource="FULL" objectname="DurableDB.dt.Instances" id="lock1f717da2b00" mode="IX" associatedObjectId="72057594043629568">
   <owner-list>
    <owner id="process1f713fb4108" mode="IX" />
   </owner-list>
   <waiter-list>
    <waiter id="process1f720001088" mode="U" requestType="wait" />
   </waiter-list>
  </pagelock>
  <keylock hobtid="72057594043826176" dbid="5" objectname="DurableDB.dt.NewEvents" indexname="PK_NewEvents" id="lock1f70fd18f00" mode="U" associatedObjectId="72057594043826176">
   <owner-list>
    <owner id="process1f720001088" mode="S" />
   </owner-list>
   <waiter-list>
    <waiter id="process1f713fb4108" mode="X" requestType="convert" />
   </waiter-list>
  </keylock>
 </resource-list>
</deadlock>

Here is the visual:

image

The deadlocked transactions are automatically retried and the orchestrations continue to run successfully, but this does create delays and also creates noise in the logs. Ideally there should be no deadlocks at all during the long-haul stress test.

bhugot commented 4 months ago

@cgillum you should try this again with last fix