Azure / azure-cosmos-dotnet-v3

.NET SDK for Azure Cosmos DB for the core SQL API
MIT License
723 stars 477 forks source link

Diagnostics: Adds DurationInMs to StoreResult #4462

Closed j82w closed 1 month ago

j82w commented 2 months ago

Pull Request Template

Description

This PR adds the store result duration time to the diagnostics. The information was already being captured but was not added to the JSON diagnostics. This is useful to help determine how long each individual request to the backend took and the duration between each call.

Example of a strong write. This shows the initial write is response time is 2024-05-01T16:01:48.4597873Z. The barrier request response time is 2024-05-01T16:01:48.6634187Z. This gives 204ms between response time and the barrier request took 86ms leaving 172ms between the write response and the start of the barrier request. Before the diagnostics only had the transport start time so it was not clear where the latency was coming from.

"name": "Microsoft.Azure.Documents.ServerStoreModel Transport Request",
    "duration in milliseconds": 261.0978,
    "data": {
        "Client Side Request Stats": {
            "Id": "AggregatedClientSideRequestStatistics",
            "ContactedReplicas": [
                {
                    "Count": 1,
                    "Uri": "rntbd://test61-northcentralus1-be1.documents-test.windows-int.net:14300/apps/fbb1f3ba-0727-4d27-96d3-ae53d09fb41f/services/f9810a7f-02e5-49a5-9cf3-9381f3d82028/partitions/6106326a-79d5-453e-abe4-9031ea322d0f/replicas/133588946056943461p/"
                },
                {
                    "Count": 2,
                    "Uri": "rntbd://test61-northcentralus1-be1.documents-test.windows-int.net:14002/apps/fbb1f3ba-0727-4d27-96d3-ae53d09fb41f/services/f9810a7f-02e5-49a5-9cf3-9381f3d82028/partitions/6106326a-79d5-453e-abe4-9031ea322d0f/replicas/133588945885498657s/"
                },
                {
                    "Count": 1,
                    "Uri": "rntbd://test61-northcentralus1-be1.documents-test.windows-int.net:14003/apps/fbb1f3ba-0727-4d27-96d3-ae53d09fb41f/services/f9810a7f-02e5-49a5-9cf3-9381f3d82028/partitions/6106326a-79d5-453e-abe4-9031ea322d0f/replicas/133588945525024899s/"
                },
                {
                    "Count": 1,
                    "Uri": "rntbd://test61-northcentralus1-be1.documents-test.windows-int.net:14301/apps/fbb1f3ba-0727-4d27-96d3-ae53d09fb41f/services/f9810a7f-02e5-49a5-9cf3-9381f3d82028/partitions/6106326a-79d5-453e-abe4-9031ea322d0f/replicas/133588946013817394s/"
                }
            ],
            "RegionsContacted": [],
            "FailedReplicas": [],
            "AddressResolutionStatistics": [],
 "StoreResponseStatistics": [
    {
        "ResponseTimeUTC": "2024-05-01T16:01:48.4597873Z",
        "DurationInMs": 55.9657,
        "ResourceType": "Document",
        "OperationType": "Create",
        "LocationEndpoint": "https://test-4-northcentralus.documents-test.windows-int.net/",
        "StoreResult": {
            "ActivityId": "5f73be56-4e58-4f0e-a854-bd330c702eeb",
            "StatusCode": "Created",
            "SubStatusCode": "Unknown",
            "LSN": 454110,
            "PartitionKeyRangeId": "0",
            "GlobalCommittedLSN": 454108,
            "ItemLSN": -1,
            "UsingLocalLSN": false,
            "QuorumAckedLSN": 454109,
            "SessionToken": "-1#454110",
            "CurrentWriteQuorum": 3,
            "CurrentReplicaSetSize": 4,
            "NumberOfReadRegions": 2,
            "IsValid": true,
            "StorePhysicalAddress": "rntbd://northcentralus1-be1.documents-test.windows-int.net:14300/apps/fbb1f3ba-0727-4d27-96d3-ae53d09fb41f/services/f9810a7f-02e5-49a5-9cf3-9381f3d82028/partitions/6106326a-79d5-453e-abe4-9031ea322d0f/replicas/133588946056943461p/",
            "RequestCharge": 25.14,
            "RetryAfterInMs": null,
            "BELatencyInMs": "5.989",
            "ReplicaHealthStatuses": [
                "(port: 14300 | status: Connected | lkt: 5/1/2024 3:59:31 PM)"
            ],
            "transportRequestTimeline": {
                "requestTimeline": [
                    {
                        "event": "Created",
                        "startTimeUtc": "2024-05-01T16:01:48.4038222Z",
                        "durationInMs": 0.0017
                    },
                    {
                        "event": "ChannelAcquisitionStarted",
                        "startTimeUtc": "2024-05-01T16:01:48.4038239Z",
                        "durationInMs": 0.0018
                    },
                    {
                        "event": "Pipelined",
                        "startTimeUtc": "2024-05-01T16:01:48.4038257Z",
                        "durationInMs": 1.2214
                    },
                    {
                        "event": "Transit Time",
                        "startTimeUtc": "2024-05-01T16:01:48.4050471Z",
                        "durationInMs": 32.5846
                    },
                    {
                        "event": "Received",
                        "startTimeUtc": "2024-05-01T16:01:48.4376317Z",
                        "durationInMs": 9.5503
                    },
                    {
                        "event": "Completed",
                        "startTimeUtc": "2024-05-01T16:01:48.4471820Z",
                        "durationInMs": 0
                    }
                ],
                "serviceEndpointStats": {
                    "inflightRequests": 1,
                    "openConnections": 1
                },
                "connectionStats": {
                    "waitforConnectionInit": "False",
                    "callsPendingReceive": 0,
                    "lastSendAttempt": "2024-05-01T16:01:48.1442807Z",
                    "lastSend": "2024-05-01T16:01:48.1447075Z",
                    "lastReceive": "2024-05-01T16:01:48.1758125Z"
                },
                "requestSizeInBytes": 41678,
                "requestBodySizeInBytes": 41226,
                "responseMetadataSizeInBytes": 493
            },
            "TransportException": null
        }
    },
    {
        "ResponseTimeUTC": "2024-05-01T16:01:48.6634187Z",
        "DurationInMs": 31.6043,
        "ResourceType": "Collection",
        "OperationType": "Head",
        "LocationEndpoint": "https://test-4-northcentralus.documents-test.windows-int.net/",

Type of change

Please delete options that are not relevant.

Closing issues

To automatically close an issue: closes #IssueNumber

ealsur commented 2 months ago

and can help show thread starvation for requests ran in parallel like strong reads

How can this be used? Is the Start Time captured before the Task is created or after? If it's before, should this then be compared with the Create on the request timeline?

j82w commented 2 months ago

How can this be used? Is the Start Time captured before the Task is created or after? If it's before, should this then be compared with the Create on the request timeline?

The start time gets set before the task is created. I updated the description.