open-telemetry / opentelemetry-js

OpenTelemetry JavaScript Client
https://opentelemetry.io
Apache License 2.0

Performance slowdown using http-instrumentation #4372

Open dsope05 opened 11 months ago

dsope05 commented 11 months ago

What happened?

Steps to Reproduce

Create a server that runs on the main thread and offloads requests to Node worker threads. Implement OpenTelemetry on the main thread; requests/sec drops by 80% when http-instrumentation is applied to the main thread.
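
For context, a minimal sketch of the setup shape described above (an assumed, simplified shape for illustration, not the actual application): the main thread owns the HTTP server and the OpenTelemetry registration, and request handling is offloaded to a worker thread.

const http = require('http');
const { Worker, isMainThread, parentPort } = require('worker_threads');

if (isMainThread) {
  // In the real app, OpenTelemetry + http-instrumentation would be registered here (main thread only).
  const worker = new Worker(__filename);
  const pending = new Map();
  let seq = 0;

  // Resolve the original HTTP response when the worker posts its result back.
  worker.on('message', ({ id, result }) => {
    const res = pending.get(id);
    pending.delete(id);
    res.end(result);
  });

  http.createServer((req, res) => {
    const id = seq++;
    pending.set(id, res);
    worker.postMessage({ id, url: req.url }); // offload the actual work
  }).listen(3000);
} else {
  // Worker thread: does the request handling work and posts the result back.
  parentPort.on('message', ({ id, url }) => {
    parentPort.postMessage({ id, result: `handled ${url}` });
  });
}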

Expected Result

http-instrumentation shouldn't decrease performance by 80% when implemented.

Actual Result

http-instrumentation decreases requests/second performance by 80%

Additional Details

OpenTelemetry Setup Code

const os = require('os');
const { isMainThread } = require('worker_threads');
const { trace, propagation } = require('@opentelemetry/api');
const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node');
const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base');
const {
  CollectorTraceExporter,
} = require('@opentelemetry/exporter-collector-proto');
const { registerInstrumentations } = require('@opentelemetry/instrumentation');
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');
const {
  FastifyInstrumentation,
} = require('@opentelemetry/instrumentation-fastify');
const {
  SemanticResourceAttributes,
} = require('@opentelemetry/semantic-conventions');
const { Resource } = require('@opentelemetry/resources');
const {
  CompositePropagator,
  W3CTraceContextPropagator,
} = require('@opentelemetry/core');
const {
  B3Propagator,
  B3InjectEncoding,
} = require('@opentelemetry/propagator-b3');

const fastify = require('../../server/fastify');
const { serviceName, version } = require('../server-info');
const { register } = require('../../utils/register-to-close');
const { SemanticELFAttributes } = require('../SemanticELFAttributes');

propagation.setGlobalPropagator(
  new CompositePropagator({
    propagators: [
      new B3Propagator({ injectEncoding: B3InjectEncoding.MULTI_HEADER }),
      new W3CTraceContextPropagator(),
    ],
  })
);

const provider = new NodeTracerProvider({
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: serviceName,
    [SemanticResourceAttributes.SERVICE_NAMESPACE]: 'xxxxx',
    [SemanticResourceAttributes.SERVICE_INSTANCE_ID]: os.hostname(),
    [SemanticResourceAttributes.SERVICE_VERSION]: version,
    [SemanticELFAttributes.SERVICE_CAR_ID]: 'xxxxxx',
    [SemanticELFAttributes.DEPLOYMENT_ENVIRONMENT]:
      process.env.EPAAS_ENV
      || (process.env.NODE_ENV === 'development' ? 'E0' : undefined),
  }),
});

provider.addSpanProcessor(
  new BatchSpanProcessor(
    new CollectorTraceExporter({
      // url: is configured via OTEL_EXPORTER_OTLP_TRACES_ENDPOINT env var
      serviceName,
      concurrencyLimit: 10, // pending requests limit, dropped after that
    }),
    {
      // basing numbers off of 700rps
      // past this size spans are dropped
      maxQueueSize: 400,
      // maximum batch size of every export (maxExportBatchSize <= maxQueueSize)
      maxExportBatchSize: 100,
      scheduledDelayMillis: 500, // interval between two consecutive exports
      exportTimeoutMillis: 15e3, // cancelled on failure
    }
  )
);

const tracer = trace.getTracer(serviceName, version);

const httpInstrumentationOptions = {
  ignoreIncomingPaths: ['/ReadMetrics.v1', '/ReadResponseCode.v1', '/favicon.ico'],
};

if (isMainThread) {
  // Initialize instrumentations on the main thread only
  const { httpRequestHook, buildHTTPResponseHook } = require('../hooks');

  Object.assign(httpInstrumentationOptions, {
    requestHook: httpRequestHook,
    responseHook: buildHTTPResponseHook(tracer),
  });

  registerInstrumentations({
    tracerProvider: provider,
    instrumentations: [
      new HttpInstrumentation(httpInstrumentationOptions),
      new FastifyInstrumentation({
        ignoreUrls: ['/ReadMetrics.v1', '/ReadResponseCode.v1', '/favicon.ico'],
        requestHook: (span, info) => {
          span.setAttribute('http.method', info.request.method);
        },
      }),
    ],
  });
}
provider.register();
register(() => provider.shutdown());

fastify.log.info(`Tracing initialized for ${isMainThread ? 'server thread' : 'worker thread'}`);
module.exports = tracer;

package.json

{
  "name": "xxxx",
  "version": "1.2.7",
  "description": "",
  "main": "server.js",
  "scripts": {
    "test": "npm run test:lint && npm run test:unit",
    "test:unit": "echo test",
    "start": "node src/server",
    "test:lint": "eslint --ignore-path .gitignore --ext js,jsx,snap ."
  },
  "bin": {
    "server": "./bin/server.js"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@babel/polyfill": "^7.12.1",
    "@fastify/cors": "^8.2.1",
    "@fastify/helmet": "^10.1.0",
    "@opentelemetry/api": "^1.4.1",
    "@opentelemetry/core": "^1.5.0",
    "@opentelemetry/exporter-collector-proto": "^0.25.0",
    "@opentelemetry/exporter-prometheus": "^0.36.1",
    "@opentelemetry/instrumentation": "^0.41.2",
    "@opentelemetry/instrumentation-fastify": "^0.31.1",
    "@opentelemetry/instrumentation-http": "^0.45.1",
    "@opentelemetry/metrics": "^0.24.0",
    "@opentelemetry/propagator-b3": "^1.13.0",
    "@opentelemetry/resources": "^1.13.0",
    "@opentelemetry/sdk-metrics": "^1.10.1",
    "@opentelemetry/sdk-trace-base": "^1.10.0",
    "@opentelemetry/sdk-trace-node": "^1.10.1",
    "@opentelemetry/semantic-conventions": "^1.10.1",
    "decompress": "^4.2.1",
    "decompress-targz": "^4.1.1",
    "eslint": "^8.46.0",
    "etcd3": "^1.1.0",
    "fastify": "^4.17.0",
    "helmet": "^5.0.2",
    "ip": "^1.1.8",
    "jest": "^28.1.3",
    "jks-js": "^1.0.2",
    "node-fetch": "^2.6.7",
    "opentelemetry-node-metrics": "^3.0.0",
    "os": "^0.1.2",
    "path": "^0.12.7",
    "pino": "^8.14.1",
    "pino-opentelemetry-transport": "^0.2.0",
    "pino-pretty": "^10.2.0",
    "stream": "0.0.2",
    "util": "^0.12.4",
    "uuid": "^8.3.2"
  },
  "overrides": {
    "core-js": "3.28.0",
    "import-in-the-middle": "1.4.2",
    "protobufjs": "6.8.9"
  },
  "engines": {
    "node": "=18.x.x",
    "npm": "=9.x.x"
  }
}

Relevant log output

No response

zzadxz commented 9 months ago

Hey! I can help with this. May I be assigned?

pichlermarc commented 9 months ago

> Hey! I can help with this. May I be assigned?

Great! It's yours, thanks for picking this up :slightly_smiling_face:

olafkrawczyk commented 5 months ago

Hello, any updates on this issue? I've noticed a 2x slowdown using OpenTelemetry with a Node and GraphQL app.

drodil commented 2 months ago

Do you happen to have any updates on this issue?

I've encountered the same issue, and according to a stack trace the problem might be in the incomingRequest method:

[screenshot: stack trace]

I am not sure there's much that can be done, as it's parsing the request in Node's internal server, and that is for some reason very CPU-hungry.
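
In case it helps someone narrow this down, here is a sketch of capturing a CPU profile in-process with Node's built-in inspector module while the server is under load (the file name and duration are arbitrary assumptions); the resulting .cpuprofile can be opened in Chrome DevTools to see how much time incomingRequest actually accounts for:

const fs = require('fs');
const inspector = require('inspector');

const session = new inspector.Session();
session.connect();

// Profile the running process for the given number of milliseconds,
// then write a .cpuprofile file that Chrome DevTools can open.
function profileFor(ms) {
  session.post('Profiler.enable', () => {
    session.post('Profiler.start', () => {
      setTimeout(() => {
        session.post('Profiler.stop', (err, result) => {
          if (err) throw err;
          fs.writeFileSync('otel-http.cpuprofile', JSON.stringify(result.profile));
          session.disconnect();
        });
      }, ms);
    });
  });
}

// Call this from the app while load-testing it.
profileFor(30000);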

WilixLead commented 1 month ago

It looks like the OTel operations produce a big event loop lag, but I'm not sure. I have Prometheus metrics on the Node.js app and got this screenshot:

[screenshot: event loop lag dashboard]

On the left side tracing is enabled; then I disabled it and we see a different picture on the right side. (I have 6 replicas of the app running in Kubernetes, which is why there are many lines.)
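
To double-check whether the lag really shows up in the event loop, a quick local sanity check with Node's built-in perf_hooks histogram could look like this (a sketch, independent of the Prometheus setup; the interval values are arbitrary):

const { monitorEventLoopDelay } = require('perf_hooks');

// Sample the event loop delay every 20 ms and print percentiles every 5 s.
const histogram = monitorEventLoopDelay({ resolution: 20 });
histogram.enable();

setInterval(() => {
  // Values are reported in nanoseconds; convert to milliseconds.
  const p50 = (histogram.percentile(50) / 1e6).toFixed(1);
  const p99 = (histogram.percentile(99) / 1e6).toFixed(1);
  console.log(`event loop delay p50=${p50}ms p99=${p99}ms`);
  histogram.reset();
}, 5000);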

Some details about my code: it is a Koa app, and in the index file I have code like:

if (env.TRACING_ENABLED) {
  app.use(async (ctx, next) => {
    const tracer = trace.getTracer("myapp");

    const span = tracer.startSpan("http_request", {
      attributes: {
        "http.method": ctx.method,
        "http.path": ctx.path,
      },
    });

    try {
      await next();

      span.setAttribute("http.status_code", ctx.status);
    } catch (err) {
      span.recordException(err);
      span.setAttribute("http.status_code", 500);

      throw err;
    } finally {
      span.end();
    }
  });
}

My tracing initialization looks like:

const traceExporter = new OTLPTraceExporter({
  url: env.TRACING_URL,
  compression: CompressionAlgorithm.GZIP,
});

const sdk = new NodeSDK({
  resource: new Resource({
    [ATTR_SERVICE_NAME]: env.SERVICE_NAME,
    [ATTR_SERVICE_VERSION]: env.VERSION,
  }),
  traceExporter,
  instrumentations: [
    getNodeAutoInstrumentations({
      "@opentelemetry/instrumentation-koa": {
        requestHook: (span, info) => {
          info.context.set("X-Request-Id", span.spanContext().traceId);
        },
      },
    }),
    new WinstonInstrumentation(),
    new BullMQInstrumentation(),
    new UndiciInstrumentation(), // for Fetch API
    new SequelizeInstrumentation(),
  ],
});

try {
  sdk.start();
} catch (err) {
  console.error(`Error starting OpenTelemetry SDK`, err);
}

Also, I think GZIP can affect the load. Next I will try disabling GZIP and see what happens.
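
A minimal sketch of what turning compression off could look like, based on the exporter setup above (the import paths are an assumption; use whichever OTLP trace exporter package the app already depends on):

const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc');
const { CompressionAlgorithm } = require('@opentelemetry/otlp-grpc-exporter-base');

const traceExporter = new OTLPTraceExporter({
  url: process.env.TRACING_URL,
  // Set NONE explicitly or omit the option entirely; compression can also be
  // controlled via the OTEL_EXPORTER_OTLP_COMPRESSION environment variable.
  compression: CompressionAlgorithm.NONE,
});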

WilixLead commented 1 month ago

So yes, the problem was GZIP. I just disabled it:

[screenshot: metrics after disabling GZIP]

drodil commented 1 month ago

@WilixLead nice find, but with the default configuration the compression is set to NONE, so it's probably not the only thing causing this. See https://github.com/open-telemetry/opentelemetry-js/blob/6515ed8098333646a63a74a8c0150cc2daf520db/experimental/packages/otlp-grpc-exporter-base/src/util.ts#L154

WilixLead commented 1 month ago

I know :) We just used an example found on the Internet that had compression enabled. Copy-paste coding...

mbrevda commented 1 month ago

I'm not sure the performance issues are limited to HTTP. I'm seeing as much as a ~40% decrease in requests per second on an Express.js app, and disabling the HTTP instrumentation doesn't have much of an effect for me.
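
A rough way to quantify this kind of regression is to benchmark the same app twice, once with and once without the instrumentation, for example with autocannon (a sketch; the URL, connection count, and duration are assumptions):

const autocannon = require('autocannon');

async function main() {
  const label = process.argv[2] || 'run'; // e.g. "with-otel" / "without-otel"
  const result = await autocannon({
    url: 'http://localhost:3000', // assumed address of the app under test
    connections: 100,
    duration: 30,
  });
  console.log(`${label}: ${result.requests.average} req/sec on average`);
}

main();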