pinojs / pino

🌲 super fast, all natural json logger
http://getpino.io
MIT License

Modify data post-serialization before going out to stream? #1511

Closed theogravity closed 2 years ago

theogravity commented 2 years ago

I recently added a custom callback to pino-socket that lets me modify the data before it is written to the socket.

However, I noticed that in async mode, the data object may contain a batch of log entries as opposed to one. I'd like to use async mode, but be able to manipulate the serialized data before it gets written to the buffer (which would then go out to the transport).

Is there a way to do that? From a performance perspective, I'd rather do the modification at that earlier stage instead of in the custom callback, where I have to convert the buffer to a string, split it on newlines, prepend data to each entry, then re-join and convert back to a buffer.

My use-case is this:

The DataDog TCP format requires that an API key be prepended to each log entry:

<api key> <pino serialized log entry>\n

Here's what I currently have to do:

/* eslint-disable */
// https://github.com/pinojs/pino-pretty#handling-non-serializable-options

module.exports = (opts) => {
  return require('pino-socket')({
    ...opts,
    onBeforeDataWrite: (data) => {
      return prependApiKey(opts.dataDogApiKey, data)
    }
  })
}

// This is the area that I would like to optimize
function prependApiKey(dataDogApiKey, data) {
  return Buffer.from(
    data.toString()
      // remove the trailing newline as it would result in a blank entry on the final join
      .trim()
      .split('\n')
      .map((logEntry) => dataDogApiKey + ' ' + logEntry)
      .join('\n')
  )
}
mcollina commented 2 years ago

What's your complete setup? It seems you are passing data through pino-pretty and then pino-socket, is that correct?

Note that pino shines in producing logs that are in JSON format and easily parseable.

theogravity commented 2 years ago

What's your complete setup? It seems you are passing data through pino-pretty and then pino-socket, is that correct?

Note that pino shines in producing logs that are in JSON format and easily parseable.

I'm not passing through pino-pretty.

The setup looks like this:

const logger = pino({
    level: process.env.LOG_LEVEL || 'trace',
    formatters: {
      level: (level) => {
        // format the level field to be datadog-compat
        return { level };
      }
    }
  }, pino.transport({
    target: join(__dirname, 'pino-datadog-logger.js'),
    options: {
      secure: true,
      address: 'intake.logs.datadoghq.com',
      port: 443,
      mode: 'tcp',
      dataDogApiKey: process.env.DATADOG_API_KEY,
    }
  }))
// pino-datadog-logger.js
/* eslint-disable */
// https://github.com/pinojs/pino-pretty#handling-non-serializable-options

module.exports = (opts) => {
  return require('pino-socket')({
    ...opts,
    onBeforeDataWrite: (data) => {
      return prependApiKey(opts.dataDogApiKey, data)
    }
  })
}

// re-visit this function as we may not need to do this messy conversion
// in the future if a fix is provided by the pino team
// https://github.com/pinojs/pino/issues/1511
function prependApiKey(dataDogApiKey, data) {
  return Buffer.from(
    data.toString()
      // remove the trailing newline as it would result in a blank entry on the final join
      .trim()
      .split('\n')
      .map((logEntry) => dataDogApiKey + ' ' + logEntry)
      .join('\n')
  )
}

If I do the following:

    onBeforeDataWrite: (data) => {
      console.log('*****')
      console.log(data.toString())
      return prependApiKey(opts.dataDogApiKey, data)
    }

I'll get multiple entries batched into a single chunk on app startup:

*****
{"level":"info","time":1659863440732,"pid":66537,"hostname":"theo-switchboard.local","msg":"session_writer redis client connected to redis db"}
{"level":"info","time":1659863440735,"pid":66537,"hostname":"theo-switchboard.local","msg":"session_writer redis client ready"}
{"level":"info","time":1659863440737,"pid":66537,"hostname":"theo-switchboard.local","msg":"session_reader redis client connected to redis db"}
{"level":"info","time":1659863440738,"pid":66537,"hostname":"theo-switchboard.local","msg":"session_reader redis client ready"}
{"level":"info","time":1659863440740,"pid":66537,"hostname":"theo-switchboard.local","msg":"Auth API server started on port 3030"}
mcollina commented 2 years ago

pino.transport() returns a stream, so I think you could put a Transform (or syncthrough, or minipass) in between to perform this manipulation (make sure to use objectMode) before sending it to the transport.

Note that pino produces newline-delimited JSON; I'm not adding a new hook for manipulating the data before shipping it out.

(I see a comment referencing pino-pretty in your code, what does that refer to? Is it relevant here?)

jsumners commented 2 years ago

Is there any particular reason you don't use https://github.com/DataDog/dd-trace-js?

theogravity commented 2 years ago

Transform

That tip really helped! Thank you. Doing the following converts the data into a series of objects:

const p = pino({
  level: process.env.LOG_LEVEL || 'trace',
  formatters: {
    level: (level) => {
      // format the level field to be datadog-compat
      return { level };
    }
  }
}, pino.transport({
  pipeline: [
    {
      target: join(__dirname, 'transformer.js'),
      options: {},
    },
    {
      target: join(__dirname, 'object-socket.js'),
      options: {
        secure: true,
        address: 'intake.logs.datadoghq.com',
        port: 443,
        mode: 'tcp',
        dataDogApiKey: process.env.DATADOG_API_KEY,
      },
    }
  ]
}))
// transformer.js
const build = require('pino-abstract-transport')
const { pipeline, Transform } = require('stream')

module.exports = async function (options) {
  return build(function (source) {
    const myTransportStream = new Transform({
      // Make sure autoDestroy is set,
      // this is needed in Node v12 or when using the
      // readable-stream module.
      autoDestroy: true,

      objectMode: true,
      transform (chunk, enc, cb) {
        console.log(chunk)
        // modifies the payload somehow
        chunk.service = 'pino'

        // stringify the payload again
        this.push(JSON.stringify(chunk))
        cb()
      }
    })
    pipeline(source, myTransportStream, () => {})
    return myTransportStream
  }, {
    // This is needed to be able to pipeline transports.
    enablePipelining: true
  })
}
// object-socket.js
module.exports = (opts) => {
  return require('pino-socket')({
    ...opts,
    onBeforeDataWrite: (data) => {
      console.log(typeof data)
      return data
    }
  })
}

output:

{
  level: 'info',
  time: 1659900303142,
  pid: 75617,
  hostname: 'theo-switchboard.local',
  msg: 'session_writer redis client connected to redis db'
}
{
  level: 'info',
  time: 1659900303158,
  pid: 75617,
  hostname: 'theo-switchboard.local',
  msg: 'session_writer redis client ready'
}
{
  level: 'info',
  time: 1659900303160,
  pid: 75617,
  hostname: 'theo-switchboard.local',
  msg: 'session_reader redis client connected to redis db'
}
{
  level: 'info',
  time: 1659900303161,
  pid: 75617,
  hostname: 'theo-switchboard.local',
  msg: 'session_reader redis client ready'
}
{
  level: 'info',
  time: 1659900303163,
  pid: 75617,
  hostname: 'theo-switchboard.local',
  msg: 'Auth API server started on port 3030'
}
object
object
object
object
object

Regarding the pino-pretty link: if you attempt to define onBeforeDataWrite within the transport options themselves, you'll get the following:

DOMException [DataCloneError]: (e)=>{
} could not be cloned.

I referenced the link because it notes that some options are not serializable and it uses the same solution:

Using the new [pino v7+ transports](https://getpino.io/#/docs/transports?id=v7-transports) not all options are serializable, for example if you want to use messageFormat as a function you will need to wrap pino-pretty in a custom module.
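The README then shows wrapping pino-pretty in a small custom module. A minimal sketch along those lines (the wrapper filename and the messageFormat function here are just illustrative):

// pretty-wrapper.js, a hypothetical wrapper so a messageFormat function
// (which is not serializable across the worker boundary) can still be used
module.exports = (opts) => {
  return require('pino-pretty')({
    ...opts,
    messageFormat: (log, messageKey) => `hello ${log[messageKey]}`
  })
}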
theogravity commented 2 years ago

Is there any particular reason you don't use https://github.com/DataDog/dd-trace-js?

dd-trace-js only generates and sends traces, not logs. You have to send logs separately from a trace, and the mechanism for sending logs out can vary (as you've seen, it can be via TCP, HTTPS, console output, or even log files), so the library does not facilitate log sending.

There are logging integrations (such as the one for pino) that you can use with it so the library can stamp the trace id onto any log output; when the logs are sent to Datadog, Datadog can then match them against the trace. For example, if you use the pino integration, it will patch the module so the trace id is added to the log object when pino generates it.
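For illustration, a minimal sketch of enabling that correlation with dd-trace's logInjection option (the tracer has to be initialized before pino is required so its integration can patch the logger):

// tracer.js, a hypothetical entry point for this sketch
// initialize dd-trace before requiring pino so the pino integration can patch it
const tracer = require('dd-trace').init({
  logInjection: true // stamps the active trace correlation ids onto each log record
})

const pino = require('pino')
const logger = pino()

// when a trace is active, this record should carry the trace id for Datadog to correlate
logger.info('hello from a traced request')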

theogravity commented 2 years ago

I ended up writing my own pino datadog transport because there were tradeoffs between using a TCP (+TLS) vs HTTPS transport.

The biggest one in favor of HTTPS is that it can compress log data before sending it, which you can't do with the TCP method. Even the Datadog agent transitioned from sending over raw TCP to HTTPS in its recent versions and recommends that approach over TCP.

// https://docs.datadoghq.com/agent/logs/log_transport?tab=https
For Agent v6.19+/v7.19+, the default transport used for your logs is compressed HTTPS instead of TCP for the previous versions. When the Agent starts, if log collection is enabled, it runs a HTTPS connectivity test. If successful, then the Agent uses the compressed HTTPS transport, otherwise the Agent falls back to a TCP transport.
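As an illustration only, here is a rough sketch of posting a gzipped batch of log records to the Datadog HTTP intake with plain Node APIs; the host, path, and DD-API-KEY header reflect my reading of the Datadog docs and should be double-checked:

// hypothetical sketch: POST a gzipped batch of log records to Datadog's HTTP intake
const https = require('https')
const zlib = require('zlib')

function sendBatch (records, apiKey, cb) {
  // the v2 logs intake accepts a JSON array of log records
  const body = zlib.gzipSync(JSON.stringify(records))
  const req = https.request({
    host: 'http-intake.logs.datadoghq.com',
    path: '/api/v2/logs',
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Content-Encoding': 'gzip',
      'DD-API-KEY': apiKey,
      'Content-Length': body.length
    }
  }, (res) => {
    res.resume() // drain the response body
    res.on('end', () => cb(null, res.statusCode))
  })
  req.on('error', cb)
  req.end(body)
}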
theogravity commented 2 years ago

Here's the final working solution (thank you everyone!):

Feel free to close the issue if you have no further questions for me.

const p = pino({
  level: process.env.LOG_LEVEL || 'trace',
  formatters: {
    level: (level) => {
      // format the level field to be datadog-compat
      return { level };
    }
  }
}, pino.transport({
  pipeline: [
    {
      target: join(__dirname, 'transformer.js'),
      options: {},
    },
    {
      target: join(__dirname, 'pino-datadog-logger.js'),
      options: {
        secure: true,
        address: 'intake.logs.datadoghq.com',
        port: 443,
        mode: 'tcp',
        dataDogApiKey: process.env.DATADOG_API_KEY,
      },
    }
  ]
}))
// transformer.js
const build = require('pino-abstract-transport')
const { pipeline, Transform } = require('stream')

module.exports = async function (options) {
  return build(function (source) {
    const myTransportStream = new Transform({
      // Make sure autoDestroy is set,
      // this is needed in Node v12 or when using the
      // readable-stream module.
      autoDestroy: true,

      objectMode: true,
      transform (chunk, enc, cb) {
        // stringify the payload again
        this.push(JSON.stringify(chunk))
        cb()
      }
    })
    pipeline(source, myTransportStream, () => {})
    return myTransportStream
  }, {
    // This is needed to be able to pipeline transports.
    enablePipelining: true
  })
}
// pino-datadog-logger.js
/* eslint-disable */
// https://github.com/pinojs/pino-pretty#handling-non-serializable-options
// onBeforeDataWrite is a function, and functions in the transport options are not serializable,
// so we have to create this file to register it

module.exports = (opts) => {
  return require('pino-socket')({
    ...opts,
    onBeforeDataWrite: (data) => {
      return Buffer.from(opts.dataDogApiKey + ' ' + data, 'utf8');
    }
  })
}

Output:

(screenshot of the output: CleanShot 2022-08-07 at 12 42 39@2x)
mcollina commented 2 years ago

Glad to be of help!

github-actions[bot] commented 2 years ago

This issue has been automatically locked since there has not been any recent activity after it was closed. Please open a new issue for related bugs.