open-telemetry / opentelemetry-js-contrib

OpenTelemetry instrumentation for JavaScript modules
https://opentelemetry.io
Apache License 2.0
685 stars 502 forks source link

Santinize db.statement for static queries #1744

Open dineshg13 opened 11 months ago

dineshg13 commented 11 months ago

What version of OpenTelemetry are you using?

My package.json

{
  "name": "js",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@opentelemetry/api": "^1.6.0",
    "@opentelemetry/auto-instrumentations-node": "^0.39.4",
    "@opentelemetry/instrumentation-pg": "^0.36.2",
    "body-parser": "^1.20.2",
    "express": "^4.18.2",
    "pg": "^8.11.3",
    "sqlite3": "^5.1.6"
  }
}

What version of Node are you using?

v20.6.1

What did you do?

When we use opentelemetry-instrument to automatically instrument a express app connected to postgres database, we notice that it doesn't sanitize queries with constant parameters, even though it collects db.statement by default. It goes against OTel semantic conventions .

What did you expect to see?

I expect to see db.statement sanitized. But am seeing below trace for static queries (/select endpoint in below code). Sanitization works well for queries with params (/users/John) in below code.

What did you see instead?

{
  traceId: '7fed3a44c4611f0035e5091c9a6bbed7',
  parentId: '6666ad513beeb304',
  traceState: undefined,
  name: 'pg.query:SELECT postgres',
  id: '8e230a83eb8f174d',
  kind: 2,
  timestamp: 1697586032703000,
  duration: 2141.875,
  attributes: {
    'db.system': 'postgresql',
    'db.name': 'postgres',
    'db.connection_string': 'postgresql://localhost:5432/postgres',
    'net.peer.name': 'localhost',
    'net.peer.port': 5432,
    'db.user': 'dinesh.gurumurthy',
    'db.statement': "select * from users where name = 'John'"
  },
  status: { code: 0 },
  events: [],
  links: []
}
{
  traceId: '7fed3a44c4611f0035e5091c9a6bbed7',
  parentId: undefined,
  traceState: undefined,
  name: 'GET /select',
  id: '6666ad513beeb304',
  kind: 1,
  timestamp: 1697586032681000,
  duration: 26401.625,
  attributes: {
    'http.url': 'http://localhost:3000/select',
    'http.host': 'localhost:3000',
    'net.host.name': 'localhost',
    'http.method': 'GET',
    'http.scheme': 'http',
    'http.target': '/select',
    'http.user_agent': 'curl/8.1.2',
    'http.flavor': '1.1',
    'net.transport': 'ip_tcp',
    'net.host.ip': '::ffff:127.0.0.1',
    'net.host.port': 3000,
    'net.peer.ip': '::ffff:127.0.0.1',
    'net.peer.port': 51169,
    'http.status_code': 200,
    'http.status_text': 'OK',
    'http.route': '/select'
  },
  status: { code: 0 },
  events: [],
  links: []
}

Additional context

Code

const express = require('express');
const bodyParser = require('body-parser');
const db = require('./database');

const app = express();
const PORT = 3000;

app.use(bodyParser.json());

// CREATE: Add a new user
app.post('/users', async (req, res) => {
    const { name, email } = req.body;
    try {
        const { rows } = await db.query("INSERT INTO users (name, email) VALUES ($1, $2) RETURNING id", [name, email]);
        res.json({ id: rows[0].id, name, email });
    } catch (err) {
        res.status(500).json(err);
    }
});

// READ: Get all users
app.get('/users', async (req, res) => {
    try {
        const { rows } = await db.query("SELECT * FROM users");
        res.json(rows);
    } catch (err) {
        res.status(500).json(err);
    }
});

// READ: Get user by name
app.get('/users/:name', async (req, res) => {
    try {
        const { rows } = await db.query("SELECT * FROM users WHERE name = $1", [req.params.name]);
        if (!rows.length) {
            return res.status(404).json({ message: "User not found" });
        }
        res.json(rows[0]);
    } catch (err) {
        res.status(500).json(err);
    }
});

// READ: Get user by name
app.get('/select', async (req, res) => {
    try {
        sql = "select * from users where name = 'John'"
        const { rows } = await db.query(sql)
        if (!rows.length) {
            return res.status(404).json({ message: "User not found" });
        }
        res.json(rows[0]);
    } catch (err) {
        res.status(500).json(err);
    }
});
app.listen(PORT, () => {
    console.log(`Server started on http://localhost:${PORT}`);
});
dyladan commented 11 months ago

Right now we don't sanitize static queries because it would require parsing the SQL itself. Queries with parameters are properly sanitized because we know what they are. The only way to solve this would be to disable collecting ALL static queries by default and making them opt-in.

Also note that the semconv is a recommendation, not a requirement in this case.

dyladan commented 11 months ago

I think we should engage with the semconv group on what we should do in this case. I think its unlikely that they expect SDKs to parse SQL statements, but I also don't know if they intended to disable collection of static queries.

dineshg13 commented 11 months ago

@dyladan That make sense. I found similar issue with Python SDK https://github.com/open-telemetry/opentelemetry-python-contrib/issues/2003 . While Java sdk is sanitizing static queries. we Should address the inconsistencies across languages.

david-luna commented 11 months ago

There are other instrumentations moving to an opt-in approach. In this PR the intent is to let the user provide a serializer for the statement defaulting to a noop which does not serialize at all

https://github.com/open-telemetry/opentelemetry-js-contrib/pull/1748