grpc / grpc-node

gRPC for Node.js
https://grpc.io
Apache License 2.0
4.47k stars 647 forks source link

[Question] How to configure retry policy for grpc-js? #2342

Closed i7N3 closed 1 year ago

i7N3 commented 1 year ago

Background

grpc.helper.ts

import * as assert from 'assert';
import * as grpc from '@grpc/grpc-js';
import { GoogleAuth } from 'google-auth-library';
import {
    Closeable,
    ClientProxy,
    GrpcOptions,
    ClientProxyFactory,
} from '@nestjs/microservices';

export async function googleCloudGrpcConfigurator(
    targetGrpcServer: GrpcOptions,
): Promise<ClientProxy & Closeable> {
    assert(targetGrpcServer.options, 'no options provided');
    assert(targetGrpcServer.options.package, 'no package provided');
    assert(targetGrpcServer.options.protoPath, 'no protopath provided');

    const audience = targetGrpcServer.options.url.split(':')[0];
    const target = `https://${audience}`;
    const idTokenClient = await new GoogleAuth().getIdTokenClient(target);
    const channelCredentials = grpc.credentials.createSsl();

    const creds = grpc.credentials.createInsecure(); // For local development
    const callCreds =
        grpc.credentials.createFromGoogleCredential(idTokenClient);
    targetGrpcServer.options.credentials =
        grpc.credentials.combineChannelCredentials(
            creds,
            // channelCredentials,
            // callCreds,
        );

    return ClientProxyFactory.create({
        transport: targetGrpcServer.transport,
        options: {
            ...targetGrpcServer.options,
            channelOptions: {
                ...(targetGrpcServer.options?.channelOptions || {}),
                'grpc.service_config': JSON.stringify({
                    methodConfig: [
                        {
                            name: [
                                { service: targetGrpcServer.options.package },
                            ],
                            retryPolicy: {
                                maxAttempts: 4,
                                initialBackoff: '1s',
                                maxBackoff: '10s',
                                backoffMultiplier: 2,
                                retryableStatusCodes: ['UNAVAILABLE'],
                            },
                        },
                    ],
                }),
            },
        },
    });
}

app.module.ts

@Module({
    imports: [...],
    controllers: [...],
    providers: [
        ...
        {
            inject: [ConfigService],
            provide: GRPC_PACKAGE_NAME,
            useFactory: (config: ConfigService) => {
                const url = '...'
                const protoPath = '...'
                return googleCloudGrpcConfigurator({
                    transport: Transport.GRPC,
                    options: {
                        url: url,
                        protoPath: protoPath,
                        package: GRPC_PACKAGE_NAME,
                    },
                });
            },
        }
    ]
})

Problem description

When I get 14 UNAVAILABLE I see only one attempt and I don't know if it's okay, but I expected to see 4 tries.

Reproduction steps

Environment

Additional context (logs)

[18:00:16] Starting compilation in watch mode...

[18:00:21] Found 0 errors. Watching for file changes.

D 2023-02-02T14:00:22.184Z | index | Loading @grpc/grpc-js version 1.8.7
D 2023-02-02T14:00:22.695Z | index | Loading @grpc/grpc-js version 1.7.1

[Nest] 19373  - 02/02/2023, 18:00:22     LOG [NestFactory] Starting Nest application...
[Nest] 19373  - 02/02/2023, 18:00:22     LOG [InstanceLoader] CommonModule dependencies initialized +75ms
[Nest] 19373  - 02/02/2023, 18:00:22     LOG [InstanceLoader] ConfigHostModule dependencies initialized +0ms
[Nest] 19373  - 02/02/2023, 18:00:22     LOG [InstanceLoader] ConfigModule dependencies initialized +0ms
[Nest] 19373  - 02/02/2023, 18:00:23     LOG [InstanceLoader] AppModule dependencies initialized +165ms

...

prisma:info Starting a postgresql pool with 17 connections.

...

D 2023-02-02T14:00:23.836Z | resolving_load_balancer | dns:0.0.0.0:3006 IDLE -> IDLE
D 2023-02-02T14:00:23.837Z | connectivity_state | (2) dns:0.0.0.0:3006 IDLE -> IDLE
D 2023-02-02T14:00:23.838Z | dns_resolver | Resolver constructed for target dns:0.0.0.0:3006
D 2023-02-02T14:00:23.838Z | channel | (2) dns:0.0.0.0:3006 Channel constructed with options {
  "grpc.service_config": "{\"methodConfig\":[{\"name\":[{\"service\":\"push_notification\"}],\"retryPolicy\":{\"maxAttempts\":4,\"initialBackoff\":\"1s\",\"maxBackoff\":\"10s\",\"backoffMultiplier\":2,\"retryableStatusCodes\":[\"UNAVAILABLE\"]}}]}"
}
D 2023-02-02T14:00:23.843Z | channel_stacktrace | (2) Channel constructed 
    at new InternalChannel (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/internal-channel.ts:287:19)
    at new ChannelImplementation (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/channel.ts:116:28)
    at new Client (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/client.ts:158:30)
    at new ServiceClientImpl (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/make-client.ts:129:3)
    at ClientGrpcProxy.createClientByServiceName (/Users/user/Work/batch-job/node_modules/@nestjs/microservices/client/client-grpc.js:69:28)
    at ClientGrpcProxy.getService (/Users/user/Work/batch-job/node_modules/@nestjs/microservices/client/client-grpc.js:32:33)
    at PushNotificationClient.onModuleInit (/Users/user/Work/batch-job/src/client/pushNotification.client.ts:23:25)
    at MapIterator.iteratee (/Users/user/Work/batch-job/node_modules/@nestjs/core/hooks/on-module-init.hook.js:22:43)
    at MapIterator.next (/Users/user/Work/batch-job/node_modules/iterare/src/map.ts:9:39)
    at IteratorWithOperators.next (/Users/user/Work/batch-job/node_modules/iterare/src/iterate.ts:19:28)
D 2023-02-02T14:00:23.844Z | resolving_load_balancer | dns:0.0.0.0:3006 IDLE -> IDLE
D 2023-02-02T14:00:23.844Z | connectivity_state | (3) dns:0.0.0.0:3006 IDLE -> IDLE
D 2023-02-02T14:00:23.845Z | dns_resolver | Resolver constructed for target dns:0.0.0.0:3006
D 2023-02-02T14:00:23.845Z | channel | (3) dns:0.0.0.0:3006 Channel constructed with options {
  "grpc.service_config": "{\"methodConfig\":[{\"name\":[{\"service\":\"email_notification\"}],\"retryPolicy\":{\"maxAttempts\":4,\"initialBackoff\":\"1s\",\"maxBackoff\":\"10s\",\"backoffMultiplier\":2,\"retryableStatusCodes\":[\"UNAVAILABLE\"]}}]}"
}
D 2023-02-02T14:00:23.847Z | channel_stacktrace | (3) Channel constructed 
    at new InternalChannel (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/internal-channel.ts:287:19)
    at new ChannelImplementation (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/channel.ts:116:28)
    at new Client (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/client.ts:158:30)
    at new ServiceClientImpl (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/make-client.ts:129:3)
    at ClientGrpcProxy.createClientByServiceName (/Users/user/Work/batch-job/node_modules/@nestjs/microservices/client/client-grpc.js:69:28)
    at ClientGrpcProxy.getService (/Users/user/Work/batch-job/node_modules/@nestjs/microservices/client/client-grpc.js:32:33)
    at EmailNotificationClient.onModuleInit (/Users/user/Work/batch-job/src/client/emailNotification.client.ts:24:53)
    at MapIterator.iteratee (/Users/user/Work/batch-job/node_modules/@nestjs/core/hooks/on-module-init.hook.js:22:43)
    at MapIterator.next (/Users/user/Work/batch-job/node_modules/iterare/src/map.ts:9:39)
    at IteratorWithOperators.next (/Users/user/Work/batch-job/node_modules/iterare/src/iterate.ts:19:28)

...

[Nest] 19373  - 02/02/2023, 18:00:23     LOG [NestApplication] Nest application successfully started +740ms
D 2023-02-02T14:00:55.226Z | channel | (2) dns:0.0.0.0:3006 createResolvingCall [0] method="/push_notification.PushNotificationService/sendUserNotification", deadline=Infinity
D 2023-02-02T14:00:55.232Z | resolving_call | [0] Created
D 2023-02-02T14:00:55.234Z | resolving_call | [0] Deadline: Infinity
D 2023-02-02T14:00:55.235Z | resolving_call | [0] start called
D 2023-02-02T14:00:55.237Z | dns_resolver | Returning IP address for target dns:0.0.0.0:3006
D 2023-02-02T14:00:55.238Z | resolving_load_balancer | dns:0.0.0.0:3006 IDLE -> CONNECTING
D 2023-02-02T14:00:55.238Z | connectivity_state | (2) dns:0.0.0.0:3006 IDLE -> CONNECTING
D 2023-02-02T14:00:55.239Z | channel | (2) dns:0.0.0.0:3006 callRefTimer.ref | configSelectionQueue.length=1 pickQueue.length=0
D 2023-02-02T14:00:55.239Z | resolving_call | [0] startRead called
D 2023-02-02T14:00:55.250Z | resolving_call | [0] write() called with message of length 29
D 2023-02-02T14:00:55.251Z | resolving_call | [0] halfClose called
D 2023-02-02T14:00:55.253Z | pick_first | Connect to address list 0.0.0.0:3006
D 2023-02-02T14:00:55.257Z | subchannel | (8) 0.0.0.0:3006 Subchannel constructed with options {
  "grpc.service_config": "{\"methodConfig\":[{\"name\":[{\"service\":\"push_notification\"}],\"retryPolicy\":{\"maxAttempts\":4,\"initialBackoff\":\"1s\",\"maxBackoff\":\"10s\",\"backoffMultiplier\":2,\"retryableStatusCodes\":[\"UNAVAILABLE\"]}}]}"
}
D 2023-02-02T14:00:55.258Z | subchannel_refcount | (8) 0.0.0.0:3006 refcount 0 -> 1
D 2023-02-02T14:00:55.258Z | subchannel_refcount | (8) 0.0.0.0:3006 refcount 1 -> 2
D 2023-02-02T14:00:55.258Z | pick_first | Start connecting to subchannel with address 0.0.0.0:3006
D 2023-02-02T14:00:55.258Z | pick_first | IDLE -> CONNECTING
D 2023-02-02T14:00:55.259Z | resolving_load_balancer | dns:0.0.0.0:3006 CONNECTING -> CONNECTING
D 2023-02-02T14:00:55.259Z | channel | (2) dns:0.0.0.0:3006 callRefTimer.unref | configSelectionQueue.length=1 pickQueue.length=0
D 2023-02-02T14:00:55.259Z | connectivity_state | (2) dns:0.0.0.0:3006 CONNECTING -> CONNECTING
D 2023-02-02T14:00:55.259Z | subchannel | (8) 0.0.0.0:3006 IDLE -> CONNECTING
D 2023-02-02T14:00:55.260Z | pick_first | CONNECTING -> CONNECTING
D 2023-02-02T14:00:55.260Z | resolving_load_balancer | dns:0.0.0.0:3006 CONNECTING -> CONNECTING
D 2023-02-02T14:00:55.260Z | connectivity_state | (2) dns:0.0.0.0:3006 CONNECTING -> CONNECTING
D 2023-02-02T14:00:55.267Z | channel | (2) dns:0.0.0.0:3006 createRetryingCall [1] method="/push_notification.PushNotificationService/sendUserNotification"
D 2023-02-02T14:00:55.269Z | retrying_call | [1] start called
D 2023-02-02T14:00:55.269Z | channel | (2) dns:0.0.0.0:3006 createLoadBalancingCall [2] method="/push_notification.PushNotificationService/sendUserNotification"
D 2023-02-02T14:00:55.270Z | retrying_call | [1] Created child call [2] for attempt 1
D 2023-02-02T14:00:55.270Z | load_balancing_call | [2] start called
D 2023-02-02T14:00:55.270Z | load_balancing_call | [2] Pick result: QUEUE subchannel: null status: undefined undefined
D 2023-02-02T14:00:55.270Z | channel | (2) dns:0.0.0.0:3006 callRefTimer.ref | configSelectionQueue.length=0 pickQueue.length=1
D 2023-02-02T14:00:55.271Z | retrying_call | [1] startRead called
D 2023-02-02T14:00:55.271Z | load_balancing_call | [2] startRead called
D 2023-02-02T14:00:55.272Z | retrying_call | [1] write() called with message of length 34
D 2023-02-02T14:00:55.272Z | load_balancing_call | [2] write() called with message of length 34
D 2023-02-02T14:00:55.272Z | retrying_call | [1] halfClose called
D 2023-02-02T14:00:55.277Z | subchannel | (8) 0.0.0.0:3006 CONNECTING -> TRANSIENT_FAILURE
D 2023-02-02T14:00:55.277Z | pick_first | CONNECTING -> TRANSIENT_FAILURE
D 2023-02-02T14:00:55.277Z | resolving_load_balancer | dns:0.0.0.0:3006 CONNECTING -> TRANSIENT_FAILURE
D 2023-02-02T14:00:55.277Z | channel | (2) dns:0.0.0.0:3006 callRefTimer.unref | configSelectionQueue.length=0 pickQueue.length=0
D 2023-02-02T14:00:55.277Z | load_balancing_call | [2] Pick result: TRANSIENT_FAILURE subchannel: null status: 14 No connection established
D 2023-02-02T14:00:55.278Z | load_balancing_call | [2] ended with status: code=14 details="No connection established"
D 2023-02-02T14:00:55.278Z | retrying_call | [1] Received status from child [2]
D 2023-02-02T14:00:55.278Z | retrying_call | [1] state=TRANSPARENT_ONLY handling status with progress PROCESSED from child [2] in state ACTIVE
D 2023-02-02T14:00:55.278Z | retrying_call | [1] ended with status: code=14 details="No connection established"
D 2023-02-02T14:00:55.278Z | connectivity_state | (2) dns:0.0.0.0:3006 CONNECTING -> TRANSIENT_FAILURE
D 2023-02-02T14:00:55.278Z | resolving_call | [0] ended with status: code=14 details="No connection established"
D 2023-02-02T14:00:55.322Z | resolving_call | [0] cancelWithStatus code: 1 details: "Cancelled on client"
D 2023-02-02T14:00:55.324Z | retrying_call | [1] cancelWithStatus code: 1 details: "Cancelled on client"
D 2023-02-02T14:00:55.324Z | retrying_call | [1] ended with status: code=1 details="Cancelled on client"
D 2023-02-02T14:00:55.324Z | load_balancing_call | [2] cancelWithStatus code: 1 details: "Cancelled on client"
[Nest] 19373  - 02/02/2023, 18:00:55   ERROR [ExceptionsHandler] 14 UNAVAILABLE: No connection established
Error: 14 UNAVAILABLE: No connection established
    at callErrorFromStatus (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/call.ts:81:17)
    at Object.onReceiveStatus (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/client.ts:356:55)
    at Object.onReceiveStatus (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/client-interceptors.ts:455:34)
    at Object.onReceiveStatus (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/client-interceptors.ts:417:48)
    at /Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/resolving-call.ts:111:24
    at processTicksAndRejections (node:internal/process/task_queues:78:11)
for call at
    at ServiceClientImpl.makeUnaryRequest (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/client.ts:326:30)
    at ServiceClientImpl.sendUserNotification (/Users/user/Work/batch-job/node_modules/@grpc/grpc-js/src/make-client.ts:189:15)
    at Observable._subscribe (/Users/user/Work/batch-job/node_modules/@nestjs/microservices/client/client-grpc.js:177:48)
    at Observable._trySubscribe (/Users/user/Work/batch-job/node_modules/rxjs/src/internal/Observable.ts:245:19)
    at /Users/user/Work/batch-job/node_modules/rxjs/src/internal/Observable.ts:235:18
    at Object.errorContext (/Users/user/Work/batch-job/node_modules/rxjs/src/internal/util/errorContext.ts:29:5)
    at Observable.subscribe (/Users/user/Work/batch-job/node_modules/rxjs/src/internal/Observable.ts:221:5)
    at /Users/user/Work/batch-job/node_modules/rxjs/src/internal/Observable.ts:473:12
    at new Promise (<anonymous>)
    at Observable.toPromise (/Users/user/Work/batch-job/node_modules/rxjs/src/internal/Observable.ts:471:12)
D 2023-02-02T14:00:56.239Z | resolving_load_balancer | dns:0.0.0.0:3006 TRANSIENT_FAILURE -> TRANSIENT_FAILURE
D 2023-02-02T14:00:56.240Z | connectivity_state | (2) dns:0.0.0.0:3006 TRANSIENT_FAILURE -> TRANSIENT_FAILURE
D 2023-02-02T14:00:56.261Z | subchannel | (8) 0.0.0.0:3006 TRANSIENT_FAILURE -> IDLE
D 2023-02-02T14:00:56.262Z | subchannel_refcount | (8) 0.0.0.0:3006 refcount 2 -> 1
D 2023-02-02T14:00:56.262Z | pick_first | TRANSIENT_FAILURE -> IDLE
D 2023-02-02T14:00:56.262Z | resolving_load_balancer | dns:0.0.0.0:3006 TRANSIENT_FAILURE -> IDLE
D 2023-02-02T14:00:56.263Z | connectivity_state | (2) dns:0.0.0.0:3006 TRANSIENT_FAILURE -> IDLE
D 2023-02-02T14:01:05.256Z | subchannel_refcount | (8) 0.0.0.0:3006 refcount 1 -> 0
murgatroid99 commented 1 year ago

It looks like that's almost right, and there's just a small error in the configuration: the logs show that the method config is using the name {"service":"push_notification"} but that you are making a request to the method /push_notification.PushNotificationService/sendUserNotification. For that method name, the service name should be push_notification.PushNotificationService.

i7N3 commented 1 year ago

Thank you @murgatroid99. It works 👍