Automattic / mongoose

MongoDB object modeling designed to work in an asynchronous environment.
https://mongoosejs.com
MIT License
26.95k stars 3.84k forks source link

mongoose insertMany 30x slower than mongodb insertMany in some cases #14719

Open benthayer opened 4 months ago

benthayer commented 4 months ago

Prerequisites

Last performant version

unknown

Slowed down in version

8.4.3

Node.js version

20.9.0

🦥 Performance issue

We have a high throughput application where our performance is limited by the number of db writes. During our testing we found mongoose's insertMany to be extremely slow compared to the insertMany provided by the mongodb package.

We were able to achieve a maximum throughput of 1900 documents written per second on a single unindexed collection by using batches of 50 documents per insertMany, with 1000 concurrent insertMany calls and minPoolSize set to 1000.

With the mongodb package, we were able to achieve 58,000 writes per second by using a similar concurrency of 1000 and 200 documents per insertMany call. This means that for this part of our application we have to bypass mongoose and use the underlying mongodb api.

Using Collection.create, we were able to achieve only a maximum of 650 documents inserted per second.

Steps to Reproduce

This is the code I was using for testing. You'd have to replace the uri with your uri obviously.

With the same settings, mongoose performs worse. If we recreate and use our production collection (with 50+ fields of varying types) we get significantly lower throughput for mongoose, but we're still able to get higher throughput with native insertMany calls. Additionally, if I set batchSize too high for mongoose, I get an out-of-memory error, which may indicate something about what's causing the performance issue (or not?). I got the error by using await benchmarkBulkInsert(writeMongoose, 1000, 1000);

// Benchmark setup: load the connection string from a dedicated env file
// and define a deliberately tiny one-field schema/model for the test.
const dotenv = require('dotenv');
const mongoose = require('mongoose')

dotenv.config({
  path: '.env.mongo.benchmark',
});

const uri = process.env.DB_URI
const name = 'benchmark'

// Minimal schema: a single string field keeps Mongoose's per-document
// casting/validation overhead as small as possible for this benchmark.
const documentSchema = new mongoose.Schema({
  foo: String,
});
const Collection = mongoose.model('Collection', documentSchema);
// Handle to the raw driver collection; populated by connectExtra().
let NativeCollection;

/**
 * Connect Mongoose to the benchmark database and capture a handle to the
 * underlying native-driver collection for the no-ODM benchmark path.
 *
 * Fix: callers pass `(uri, name)` but the original declared no parameters,
 * silently ignoring both; it also hardcoded `client.db('benchmark')`.
 * Parameters now default to the module-level values, so existing
 * zero-argument calls behave exactly as before (backward compatible).
 *
 * @param {string} [connectionUri=uri] - MongoDB connection string.
 * @param {string} [dbName=name] - Database to connect to and benchmark in.
 * @returns {Promise<import('mongoose').Connection>} the active connection.
 */
async function connectExtra(connectionUri = uri, dbName = name) {
    await mongoose.connect(connectionUri, {
        retryWrites: true,
        dbName,
        minPoolSize: 1000,
        maxPoolSize: 1000
    });
    console.log(`[CONNECTED TO DB]: ${dbName}`);
    // NOTE(review): this listener is attached only after connect() resolves,
    // so initial handshake failures surface via the await above, not here.
    // Throwing inside an event handler cannot be caught by callers either —
    // consider logging instead. Preserved as-is to keep behavior unchanged.
    mongoose.connection.on('error', (dbError) => {
        throw new Error(`Couldn't connect to the database:${dbError}`);
    });

    const client = mongoose.connection.getClient();
    const db = client.db(dbName);
    NativeCollection = db.collection('collections');
    return mongoose.connection;
}

/**
 * Insert one batch of plain documents through the native MongoDB driver,
 * bypassing Mongoose entirely.
 * @param {object[]} documents - plain objects to insert in a single call.
 */
async function writeNative(documents) {
  // forceServerObjectId lets the server assign _id values, skipping
  // client-side ObjectId generation for every document.
  const insertOptions = { forceServerObjectId: true };
  await NativeCollection.insertMany(documents, insertOptions);
}
/**
 * Insert one batch of documents through the Mongoose model, which applies
 * casting, validation, and defaults before hitting the driver.
 * @param {object[]} documents - plain objects to insert in a single call.
 */
async function writeMongoose(documents) {
  // Default options: full Mongoose document pipeline (no `lean`).
  await Collection.insertMany(documents);
}

// Thin wrapper so the benchmark entry point reads like production code.
// NOTE(review): connectExtra as declared above takes no parameters, so the
// (uri, name) arguments passed here are silently ignored — confirm intended.
async function connectToDB() {
    await connectExtra(uri, name); // Ensure your db module handles this appropriately
}

/**
 * Build a single synthetic document for the benchmark batches.
 * The shape must stay in sync with `documentSchema` so Mongoose inserts
 * exercise the same fields as native inserts.
 * @returns {{foo: string}} a fresh document literal.
 */
function createDocument() {
    const doc = { foo: 'bar' };
    return doc;
}
/**
 * Run `writeFunction` repeatedly for ~1 second at the requested concurrency
 * and report insert throughput.
 *
 * Fix: the original computed the 1-second cutoff (`endTime`) from
 * `Date.now()` *before* capturing `startTime`, so the scheduling window and
 * the measured window disagreed. The start timestamp is now captured first
 * and the cutoff derived from it.
 *
 * @param {(docs: object[]) => Promise<void>} writeFunction - batch writer
 *   (either the Mongoose or the native variant).
 * @param {number} concurrencyLevel - max in-flight insertMany calls per wave.
 * @param {number} batchSize - documents per insertMany call.
 */
async function benchmarkBulkInsert(writeFunction, concurrencyLevel, batchSize) {
    let totalInserted = 0;

    // One shared batch reused for every call; contents are identical docs.
    const documents = [];
    for (let i = 0; i < batchSize; i++) {
        documents.push(createDocument());
    }

    const preDocumentCount = await Collection.countDocuments({});
    console.log(`Starting with ${preDocumentCount} documents in the database.`);

    const insertManyTask = async () => {
        try {
            await writeFunction(documents);
            return batchSize; // Number of documents this call inserted.
        } catch (error) {
            console.error('Error during batch insert:', error);
            return 0; // Count nothing for a failed batch.
        }
    };

    const tasks = [];
    // Capture the start BEFORE deriving the cutoff so both use one baseline.
    const startTime = new Date();
    const endTime = startTime.getTime() + 1000; // Run for ~1 second.

    // NOTE(review): this is a barrier pattern — up to concurrencyLevel tasks
    // are launched, then ALL are awaited before the next wave begins, so the
    // average number of in-flight calls is lower than concurrencyLevel.
    while (Date.now() < endTime) {
        if (tasks.length < concurrencyLevel) {
            tasks.push(insertManyTask());
        } else {
            const results = await Promise.all(tasks);
            totalInserted += results.reduce((acc, val) => acc + val, 0); // Sum up results
            tasks.length = 0; // Start the next wave with an empty array.
        }
    }

    // Drain whatever was launched in the final (partial) wave.
    if (tasks.length > 0) {
        const results = await Promise.all(tasks);
        totalInserted += results.reduce((acc, val) => acc + val, 0); // Sum up remaining results
    }

    const endedAt = new Date();
    const runTime = (endedAt.getTime() - startTime.getTime()) / 1000;
    console.log('Ran in ', runTime);

    const postDocumentCount = await Collection.countDocuments({});
    console.log(`Ending with ${postDocumentCount} documents in the database.`);

    // Two counts are reported: the server-side delta (ground truth) and the
    // client-side tally (diverges from the delta if any batch failed).
    console.log(`Inserted ${postDocumentCount - preDocumentCount} documents`);
    console.log(`Inserted ${totalInserted} documents`);

    console.log(`${(postDocumentCount - preDocumentCount) / runTime} Docs per Second`);
}

/**
 * Benchmark entry point: connect, run both benchmark variants, then exit.
 *
 * Fix: the original had no error handling, so any rejection (bad URI, auth
 * failure, etc.) escaped as an unhandled promise rejection. Failures now
 * log and exit with a non-zero status; success still exits 0 as before.
 */
async function test() {
    try {
        await connectToDB();
        // Call w/ writeFunc, concurrency, batchSize
        // I get out of memory errors if I change batchSize to be too high for writeMongoose
        await benchmarkBulkInsert(writeMongoose, 1000, 50);
        await benchmarkBulkInsert(writeNative, 1000, 2000);
        process.exit(0);
    } catch (err) {
        console.error('Benchmark failed:', err);
        process.exit(1);
    }
}

test()

Expected Behavior

I would expect the throughput of mongoose's insertMany to exactly match the throughput of the native mongodb insertMany.

vkarpov15 commented 4 months ago

Mongoose's insertMany() won't exactly match the throughput of the MongoDB node driver because of defaults, casting, validation, etc. We'll see how we can improve the performance, but you can always use the lean option to bypass Mongoose's casting, validation, etc. using await Collection.insertMany(documents, { lean: true });